List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
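Before the per-project examples, a minimal usage sketch of the method: addAttribute registers the requested attribute class with the stream (or returns the instance already registered) so its value can be read after each incrementToken() call. The sketch below assumes a Lucene 4.x StandardAnalyzer and a made-up field name and input string; it is not taken from any of the source files listed further down.

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    public class AddAttributeSketch {
        public static void main(String[] args) throws IOException {
            // assumption: Lucene 4.x; field name "text" and the input string are hypothetical
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
            TokenStream stream = analyzer.tokenStream("text", new StringReader("The quick brown fox"));
            // addAttribute returns the stream's CharTermAttribute, creating it if necessary;
            // repeated calls with the same class return the same instance
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            try {
                stream.reset();                              // required before the first incrementToken()
                while (stream.incrementToken()) {
                    System.out.println(termAtt.toString());  // current token text
                }
                stream.end();                                // signal that the input is fully consumed
            } finally {
                stream.close();
            }
        }
    }

The reset()/incrementToken()/end()/close() sequence is the consumer contract the examples below follow (older pre-4.0 examples use TermAttribute instead of CharTermAttribute and sometimes omit reset()).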
From source file:org.apache.jackrabbit.core.query.lucene.SearchIndex.java
License:Apache License
/**
 * Merges the fulltext indexed fields of the aggregated node states into
 * <code>doc</code>.
 *
 * @param state the node state on which <code>doc</code> was created.
 * @param doc   the lucene document with index fields from <code>state</code>.
 * @param ifv   the current index format version.
 */
protected void mergeAggregatedNodeIndexes(NodeState state, Document doc, IndexFormatVersion ifv) {
    if (indexingConfig != null) {
        AggregateRule[] aggregateRules = indexingConfig.getAggregateRules();
        if (aggregateRules == null) {
            return;
        }
        try {
            ItemStateManager ism = getContext().getItemStateManager();
            for (AggregateRule aggregateRule : aggregateRules) {
                boolean ruleMatched = false;
                // node includes
                NodeState[] aggregates = aggregateRule.getAggregatedNodeStates(state);
                if (aggregates != null) {
                    ruleMatched = true;
                    for (NodeState aggregate : aggregates) {
                        Document aDoc = createDocument(aggregate, getNamespaceMappings(), ifv);
                        // transfer fields to doc if there are any
                        Fieldable[] fulltextFields = aDoc.getFieldables(FieldNames.FULLTEXT);
                        if (fulltextFields != null) {
                            for (Fieldable fulltextField : fulltextFields) {
                                doc.add(fulltextField);
                            }
                            doc.add(new Field(FieldNames.AGGREGATED_NODE_UUID, false,
                                    aggregate.getNodeId().toString(), Field.Store.NO,
                                    Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
                        }
                    }
                    // make sure that fulltext fields are aligned properly
                    // first all stored fields, then remaining
                    Fieldable[] fulltextFields = doc.getFieldables(FieldNames.FULLTEXT);
                    doc.removeFields(FieldNames.FULLTEXT);
                    Arrays.sort(fulltextFields, FIELDS_COMPARATOR_STORED);
                    for (Fieldable f : fulltextFields) {
                        doc.add(f);
                    }
                }
                // property includes
                PropertyState[] propStates = aggregateRule.getAggregatedPropertyStates(state);
                if (propStates != null) {
                    ruleMatched = true;
                    for (PropertyState propState : propStates) {
                        String namePrefix = FieldNames.createNamedValue(
                                getNamespaceMappings().translateName(propState.getName()), "");
                        NodeState parent = (NodeState) ism.getItemState(propState.getParentId());
                        Document aDoc = createDocument(parent, getNamespaceMappings(), ifv);
                        try {
                            // find the right fields to transfer
                            Fieldable[] fields = aDoc.getFieldables(FieldNames.PROPERTIES);
                            for (Fieldable field : fields) {
                                // assume properties fields use SingleTokenStream
                                TokenStream tokenStream = field.tokenStreamValue();
                                TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
                                PayloadAttribute payloadAttribute = tokenStream.addAttribute(PayloadAttribute.class);
                                tokenStream.incrementToken();
                                tokenStream.end();
                                tokenStream.close();
                                String value = new String(termAttribute.termBuffer(), 0, termAttribute.termLength());
                                if (value.startsWith(namePrefix)) {
                                    // extract value
                                    String rawValue = value.substring(namePrefix.length());
                                    // create new named value
                                    Path p = getRelativePath(state, propState);
                                    String path = getNamespaceMappings().translatePath(p);
                                    value = FieldNames.createNamedValue(path, rawValue);
                                    termAttribute.setTermBuffer(value);
                                    PropertyMetaData pdm = PropertyMetaData
                                            .fromByteArray(payloadAttribute.getPayload().getData());
                                    doc.add(new Field(field.name(),
                                            new SingletonTokenStream(value, pdm.getPropertyType())));
                                    doc.add(new Field(FieldNames.AGGREGATED_NODE_UUID, false,
                                            parent.getNodeId().toString(), Field.Store.NO,
                                            Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
                                    if (pdm.getPropertyType() == PropertyType.STRING) {
                                        // add to fulltext index
                                        Field ft = new Field(FieldNames.FULLTEXT, false, rawValue,
                                                Field.Store.YES, Field.Index.ANALYZED_NO_NORMS,
                                                Field.TermVector.NO);
                                        doc.add(ft);
                                    }
                                }
                            }
                        } finally {
                            Util.disposeDocument(aDoc);
                        }
                    }
                }
                // only use first aggregate definition that matches
                if (ruleMatched) {
                    break;
                }
            }
        } catch (NoSuchItemStateException e) {
            // do not fail if aggregate cannot be created
            log.info("Exception while building indexing aggregate for {}. Node is not available {}.",
                    state.getNodeId(), e.getMessage());
        } catch (Exception e) {
            // do not fail if aggregate cannot be created
            log.warn("Exception while building indexing aggregate for " + state.getNodeId(), e);
        }
    }
}
From source file:org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndex.java
License:Apache License
/**
 * Tries to merge back tokens that are split on relevant fulltext query
 * wildcards ('*' or '?').
 *
 * @param text
 * @param analyzer
 * @return
 */
static List<String> tokenize(String text, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(FieldNames.FULLTEXT, new StringReader(text));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        // TypeAttribute type = stream.addAttribute(TypeAttribute.class);
        stream.reset();
        int poz = 0;
        boolean hasFulltextToken = false;
        StringBuilder token = new StringBuilder();
        while (stream.incrementToken()) {
            String term = termAtt.toString();
            int start = offsetAtt.startOffset();
            int end = offsetAtt.endOffset();
            if (start > poz) {
                for (int i = poz; i < start; i++) {
                    for (char c : fulltextTokens) {
                        if (c == text.charAt(i)) {
                            token.append(c);
                            hasFulltextToken = true;
                        }
                    }
                }
            }
            poz = end;
            if (hasFulltextToken) {
                token.append(term);
                hasFulltextToken = false;
            } else {
                if (token.length() > 0) {
                    tokens.add(token.toString());
                }
                token = new StringBuilder();
                token.append(term);
            }
        }
        // consume to the end of the string
        if (poz < text.length()) {
            for (int i = poz; i < text.length(); i++) {
                for (char c : fulltextTokens) {
                    if (c == text.charAt(i)) {
                        token.append(c);
                    }
                }
            }
        }
        if (token.length() > 0) {
            tokens.add(token.toString());
        }
        stream.end();
    } catch (IOException e) {
        LOG.error("Building fulltext query failed", e.getMessage());
        return null;
    } finally {
        try {
            if (stream != null) {
                stream.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    return tokens;
}
From source file:org.apache.mahout.classifier.bayes.WikipediaDatasetCreatorMapper.java
License:Apache License
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    StringBuilder contents = new StringBuilder();
    String document = value.toString();
    String catMatch = findMatchingCategory(document);
    if (!"Unknown".equals(catMatch)) {
        document = StringEscapeUtils.unescapeHtml(WikipediaDatasetCreatorMapper.CLOSE_TEXT_TAG_PATTERN
                .matcher(WikipediaDatasetCreatorMapper.OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst(""))
                .replaceAll(""));
        TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        while (stream.incrementToken()) {
            contents.append(termAtt.termBuffer(), 0, termAtt.termLength()).append(' ');
        }
        context.write(
                new Text(WikipediaDatasetCreatorMapper.SPACE_NON_ALPHA_PATTERN.matcher(catMatch).replaceAll("_")),
                new Text(contents.toString()));
    }
}
From source file:org.apache.mahout.classifier.BayesFileFormatter.java
License:Apache License
/**
 * Write the tokens and the label from the Reader to the writer
 *
 * @param label    The label
 * @param analyzer The analyzer to use
 * @param inFile   the file to read and whose contents are passed to the analyzer
 * @param charset  character encoding to assume when reading the input file
 * @param writer   The Writer, is not closed by this method
 * @throws java.io.IOException if there was a problem w/ the reader
 */
private static void writeFile(String label, Analyzer analyzer, File inFile, Charset charset, Writer writer)
        throws IOException {
    Reader reader = new InputStreamReader(new FileInputStream(inFile), charset);
    try {
        TokenStream ts = analyzer.tokenStream(label, reader);
        writer.write(label);
        writer.write('\t'); // edit: in order to match Hadoop standard TextInputFormat
        TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
            char[] termBuffer = termAtt.termBuffer();
            int termLen = termAtt.termLength();
            writer.write(termBuffer, 0, termLen);
            writer.write(' ');
        }
    } finally {
        IOUtils.quietClose(reader);
    }
}
From source file:org.apache.mahout.classifier.BayesFileFormatter.java
License:Apache License
/**
 * Convert a Reader to a vector
 *
 * @param analyzer The Analyzer to use
 * @param reader   The reader to feed to the Analyzer
 * @return An array of unique tokens
 */
public static String[] readerToDocument(Analyzer analyzer, Reader reader) throws IOException {
    TokenStream ts = analyzer.tokenStream("", reader);
    List<String> coll = new ArrayList<String>();
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
        char[] termBuffer = termAtt.termBuffer();
        int termLen = termAtt.termLength();
        String val = new String(termBuffer, 0, termLen);
        coll.add(val);
    }
    return coll.toArray(new String[coll.size()]);
}
From source file:org.apache.mahout.classifier.NewsgroupHelper.java
License:Apache License
public static void countWords(Analyzer analyzer, Collection<String> words, Reader in,
        Multiset<String> overallCounts) throws IOException {
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
    overallCounts.addAll(words);
    ts.end();
    Closeables.close(ts, true);
}
From source file:org.apache.mahout.classifier.sgd.NewsgroupHelper.java
License:Apache License
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in,
        Multiset<String> overallCounts) throws IOException {
    TokenStream ts = analyzer.reusableTokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
    overallCounts.addAll(words);
}
From source file:org.apache.mahout.classifier.sgd.TrainNewsGroups.java
License:Apache License
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
    overallCounts.addAll(words);
}
From source file:org.apache.mahout.text.MailArchivesClusteringAnalyzerTest.java
License:Apache License
@Test
public void testAnalysis() throws Exception {
    Analyzer analyzer = new MailArchivesClusteringAnalyzer();
    String text = "A test message\n"
            + "atokenthatistoolongtobeusefulforclustertextanalysis\n"
            + "Mahout is a scalable, machine-learning LIBRARY\n"
            + "we've added some additional stopwords such as html, mailto, regards\t"
            + "apache_hadoop provides the foundation for scalability\n"
            + "www.nabble.com general-help@incubator.apache.org\n"
            + "public void int protected package";
    Reader reader = new StringReader(text);

    // if you change the text above, then you may need to change this as well
    // order matters too
    String[] expectedTokens = { "test", "mahout", "scalabl", "machin", "learn", "librari", "weve", "ad",
            "stopword", "apache_hadoop", "provid", "foundat", "scalabl" };

    TokenStream tokenStream = analyzer.tokenStream("test", reader);
    assertNotNull(tokenStream);
    tokenStream.reset();
    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    int e = 0;
    while (tokenStream.incrementToken() && e < expectedTokens.length) {
        assertEquals(expectedTokens[e++], termAtt.toString());
    }
    assertEquals(e, expectedTokens.length);
    tokenStream.end();
    tokenStream.close();
}
From source file:org.apache.mahout.text.wikipedia.WikipediaDatasetCreatorMapper.java
License:Apache License
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String document = value.toString();
    document = StringEscapeUtils.unescapeHtml4(CLOSE_TEXT_TAG_PATTERN
            .matcher(OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst("")).replaceAll(""));
    String catMatch = findMatchingCategory(document);
    if (!"Unknown".equals(catMatch)) {
        StringBuilder contents = new StringBuilder(1000);
        TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            contents.append(termAtt.buffer(), 0, termAtt.length()).append(' ');
        }
        context.write(new Text(SPACE_NON_ALPHA_PATTERN.matcher(catMatch).replaceAll("_")),
                new Text(contents.toString()));
        stream.end();
        Closeables.close(stream, true);
    }
}