Example usage for org.apache.lucene.analysis TokenStream addAttribute

Introduction

This page collects example usages of org.apache.lucene.analysis.TokenStream#addAttribute from open-source Apache projects.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Document

The caller must pass in a Class<? extends Attribute> value.
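
For reference, here is a minimal, self-contained sketch of the typical addAttribute call pattern (assuming a recent Lucene release, 5.x or later, where Analyzer.tokenStream accepts a String and StandardAnalyzer has a no-argument constructor; the class and method names below are illustrative only):

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AddAttributeExample {

    // Tokenizes the given text and prints each term, following the usual
    // addAttribute / reset / incrementToken / end / close sequence.
    static void printTokens(String text) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream stream = analyzer.tokenStream("field", text)) {
            // addAttribute returns the existing instance if one of this
            // attribute class is already registered with the stream
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            stream.end();
        }
    }

    public static void main(String[] args) throws IOException {
        printTokens("Apache Lucene token stream example");
    }
}

The examples below show the same pattern in real projects, using attribute classes such as TermAttribute, CharTermAttribute, OffsetAttribute and PayloadAttribute.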

Usage

From source file:org.apache.jackrabbit.core.query.lucene.SearchIndex.java

License:Apache License

/**
 * Merges the fulltext indexed fields of the aggregated node states into
 * <code>doc</code>.
 *
 * @param state the node state on which <code>doc</code> was created.
 * @param doc the lucene document with index fields from <code>state</code>.
 * @param ifv the current index format version.
 */
protected void mergeAggregatedNodeIndexes(NodeState state, Document doc, IndexFormatVersion ifv) {
    if (indexingConfig != null) {
        AggregateRule[] aggregateRules = indexingConfig.getAggregateRules();
        if (aggregateRules == null) {
            return;
        }
        try {
            ItemStateManager ism = getContext().getItemStateManager();
            for (AggregateRule aggregateRule : aggregateRules) {
                boolean ruleMatched = false;
                // node includes
                NodeState[] aggregates = aggregateRule.getAggregatedNodeStates(state);
                if (aggregates != null) {
                    ruleMatched = true;
                    for (NodeState aggregate : aggregates) {
                        Document aDoc = createDocument(aggregate, getNamespaceMappings(), ifv);
                        // transfer fields to doc if there are any
                        Fieldable[] fulltextFields = aDoc.getFieldables(FieldNames.FULLTEXT);
                        if (fulltextFields != null) {
                            for (Fieldable fulltextField : fulltextFields) {
                                doc.add(fulltextField);
                            }
                            doc.add(new Field(FieldNames.AGGREGATED_NODE_UUID, false,
                                    aggregate.getNodeId().toString(), Field.Store.NO,
                                    Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
                        }
                    }
                    // make sure that fulltext fields are aligned properly
                    // first all stored fields, then remaining
                    Fieldable[] fulltextFields = doc.getFieldables(FieldNames.FULLTEXT);
                    doc.removeFields(FieldNames.FULLTEXT);
                    Arrays.sort(fulltextFields, FIELDS_COMPARATOR_STORED);
                    for (Fieldable f : fulltextFields) {
                        doc.add(f);
                    }
                }
                // property includes
                PropertyState[] propStates = aggregateRule.getAggregatedPropertyStates(state);
                if (propStates != null) {
                    ruleMatched = true;
                    for (PropertyState propState : propStates) {
                        String namePrefix = FieldNames.createNamedValue(
                                getNamespaceMappings().translateName(propState.getName()), "");
                        NodeState parent = (NodeState) ism.getItemState(propState.getParentId());
                        Document aDoc = createDocument(parent, getNamespaceMappings(), ifv);
                        try {
                            // find the right fields to transfer
                            Fieldable[] fields = aDoc.getFieldables(FieldNames.PROPERTIES);
                            for (Fieldable field : fields) {

                                // assume properties fields use SingleTokenStream
                                TokenStream tokenStream = field.tokenStreamValue();
                                TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
                                PayloadAttribute payloadAttribute = tokenStream
                                        .addAttribute(PayloadAttribute.class);
                                tokenStream.incrementToken();
                                tokenStream.end();
                                tokenStream.close();

                                String value = new String(termAttribute.termBuffer(), 0,
                                        termAttribute.termLength());
                                if (value.startsWith(namePrefix)) {
                                    // extract value
                                    String rawValue = value.substring(namePrefix.length());
                                    // create new named value
                                    Path p = getRelativePath(state, propState);
                                    String path = getNamespaceMappings().translatePath(p);
                                    value = FieldNames.createNamedValue(path, rawValue);
                                    termAttribute.setTermBuffer(value);
                                    PropertyMetaData pdm = PropertyMetaData
                                            .fromByteArray(payloadAttribute.getPayload().getData());
                                    doc.add(new Field(field.name(),
                                            new SingletonTokenStream(value, pdm.getPropertyType())));
                                    doc.add(new Field(FieldNames.AGGREGATED_NODE_UUID, false,
                                            parent.getNodeId().toString(), Field.Store.NO,
                                            Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
                                    if (pdm.getPropertyType() == PropertyType.STRING) {
                                        // add to fulltext index
                                        Field ft = new Field(FieldNames.FULLTEXT, false, rawValue,
                                                Field.Store.YES, Field.Index.ANALYZED_NO_NORMS,
                                                Field.TermVector.NO);
                                        doc.add(ft);
                                    }
                                }
                            }
                        } finally {
                            Util.disposeDocument(aDoc);
                        }
                    }
                }

                // only use first aggregate definition that matches
                if (ruleMatched) {
                    break;
                }
            }
        } catch (NoSuchItemStateException e) {
            // do not fail if aggregate cannot be created
            log.info("Exception while building indexing aggregate for {}. Node is not available {}.",
                    state.getNodeId(), e.getMessage());
        } catch (Exception e) {
            // do not fail if aggregate cannot be created
            log.warn("Exception while building indexing aggregate for " + state.getNodeId(), e);
        }
    }
}

From source file:org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndex.java

License:Apache License

/**
 * Tries to merge back tokens that are split on relevant fulltext query
 * wildcards ('*' or '?').
 *
 * @param text the text to tokenize
 * @param analyzer the analyzer used to split the text into tokens
 * @return the list of tokens
 */
static List<String> tokenize(String text, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(FieldNames.FULLTEXT, new StringReader(text));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        // TypeAttribute type = stream.addAttribute(TypeAttribute.class);

        stream.reset();

        int poz = 0;
        boolean hasFulltextToken = false;
        StringBuilder token = new StringBuilder();
        while (stream.incrementToken()) {
            String term = termAtt.toString();
            int start = offsetAtt.startOffset();
            int end = offsetAtt.endOffset();
            if (start > poz) {
                for (int i = poz; i < start; i++) {
                    for (char c : fulltextTokens) {
                        if (c == text.charAt(i)) {
                            token.append(c);
                            hasFulltextToken = true;
                        }
                    }
                }
            }
            poz = end;
            if (hasFulltextToken) {
                token.append(term);
                hasFulltextToken = false;
            } else {
                if (token.length() > 0) {
                    tokens.add(token.toString());
                }
                token = new StringBuilder();
                token.append(term);
            }
        }
        // consume to the end of the string
        if (poz < text.length()) {
            for (int i = poz; i < text.length(); i++) {
                for (char c : fulltextTokens) {
                    if (c == text.charAt(i)) {
                        token.append(c);
                    }
                }
            }
        }
        if (token.length() > 0) {
            tokens.add(token.toString());
        }
        stream.end();
    } catch (IOException e) {
        LOG.error("Building fulltext query failed", e);
        return null;
    } finally {
        try {
            if (stream != null) {
                stream.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    return tokens;
}

From source file:org.apache.mahout.classifier.bayes.WikipediaDatasetCreatorMapper.java

License:Apache License

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    StringBuilder contents = new StringBuilder();
    String document = value.toString();
    String catMatch = findMatchingCategory(document);

    if (!"Unknown".equals(catMatch)) {
        document = StringEscapeUtils.unescapeHtml(WikipediaDatasetCreatorMapper.CLOSE_TEXT_TAG_PATTERN
                .matcher(WikipediaDatasetCreatorMapper.OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst(""))
                .replaceAll(""));
        TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        while (stream.incrementToken()) {
            contents.append(termAtt.termBuffer(), 0, termAtt.termLength()).append(' ');
        }
        context.write(new Text(
                WikipediaDatasetCreatorMapper.SPACE_NON_ALPHA_PATTERN.matcher(catMatch).replaceAll("_")),
                new Text(contents.toString()));
    }
}

From source file:org.apache.mahout.classifier.BayesFileFormatter.java

License:Apache License

/**
 * Write the tokens and the label from the Reader to the writer
 *
 * @param label
 *          The label
 * @param analyzer
 *          The analyzer to use
 * @param inFile
 *          the file to read and whose contents are passed to the analyzer
 * @param charset
 *          character encoding to assume when reading the input file
 * @param writer
 *          The Writer, is not closed by this method
 * @throws java.io.IOException
 *           if there was a problem w/ the reader
 */
private static void writeFile(String label, Analyzer analyzer, File inFile, Charset charset, Writer writer)
        throws IOException {
    Reader reader = new InputStreamReader(new FileInputStream(inFile), charset);
    try {
        TokenStream ts = analyzer.tokenStream(label, reader);
        writer.write(label);
        writer.write('\t'); // in order to match the standard Hadoop TextInputFormat
        TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
            char[] termBuffer = termAtt.termBuffer();
            int termLen = termAtt.termLength();
            writer.write(termBuffer, 0, termLen);
            writer.write(' ');
        }
    } finally {
        IOUtils.quietClose(reader);
    }
}

From source file:org.apache.mahout.classifier.BayesFileFormatter.java

License:Apache License

/**
 * Convert a Reader to a vector.
 * 
 * @param analyzer
 *          The Analyzer to use
 * @param reader
 *          The reader to feed to the Analyzer
 * @return An array of unique tokens
 */
public static String[] readerToDocument(Analyzer analyzer, Reader reader) throws IOException {
    TokenStream ts = analyzer.tokenStream("", reader);

    List<String> coll = new ArrayList<String>();
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
        char[] termBuffer = termAtt.termBuffer();
        int termLen = termAtt.termLength();
        String val = new String(termBuffer, 0, termLen);
        coll.add(val);
    }
    return coll.toArray(new String[coll.size()]);
}

From source file:org.apache.mahout.classifier.NewsgroupHelper.java

License:Apache License

public static void countWords(Analyzer analyzer, Collection<String> words, Reader in,
        Multiset<String> overallCounts) throws IOException {
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
    overallCounts.addAll(words);
    ts.end();
    Closeables.close(ts, true);
}

From source file:org.apache.mahout.classifier.sgd.NewsgroupHelper.java

License:Apache License

private static void countWords(Analyzer analyzer, Collection<String> words, Reader in,
        Multiset<String> overallCounts) throws IOException {
    TokenStream ts = analyzer.reusableTokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
    overallCounts.addAll(words);
}

From source file:org.apache.mahout.classifier.sgd.TrainNewsGroups.java

License:Apache License

private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
    overallCounts.addAll(words);
}

From source file:org.apache.mahout.text.MailArchivesClusteringAnalyzerTest.java

License:Apache License

@Test
public void testAnalysis() throws Exception {
    Analyzer analyzer = new MailArchivesClusteringAnalyzer();

    String text = "A test message\n" + "atokenthatistoolongtobeusefulforclustertextanalysis\n"
            + "Mahout is a scalable, machine-learning LIBRARY\n"
            + "we've added some additional stopwords such as html, mailto, regards\t"
            + "apache_hadoop provides the foundation for scalability\n"
            + "www.nabble.com general-help@incubator.apache.org\n" + "public void int protected package";
    Reader reader = new StringReader(text);

    // if you change the text above, then you may need to change this as well
    // order matters too
    String[] expectedTokens = { "test", "mahout", "scalabl", "machin", "learn", "librari", "weve", "ad",
            "stopword", "apache_hadoop", "provid", "foundat", "scalabl" };

    TokenStream tokenStream = analyzer.tokenStream("test", reader);
    assertNotNull(tokenStream);
    tokenStream.reset();
    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    int e = 0;
    while (tokenStream.incrementToken() && e < expectedTokens.length) {
        assertEquals(expectedTokens[e++], termAtt.toString());
    }
    assertEquals(e, expectedTokens.length);
    tokenStream.end();
    tokenStream.close();
}

From source file:org.apache.mahout.text.wikipedia.WikipediaDatasetCreatorMapper.java

License:Apache License

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String document = value.toString();
    document = StringEscapeUtils.unescapeHtml4(CLOSE_TEXT_TAG_PATTERN
            .matcher(OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst("")).replaceAll(""));
    String catMatch = findMatchingCategory(document);
    if (!"Unknown".equals(catMatch)) {
        StringBuilder contents = new StringBuilder(1000);
        TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            contents.append(termAtt.buffer(), 0, termAtt.length()).append(' ');
        }
        context.write(new Text(SPACE_NON_ALPHA_PATTERN.matcher(catMatch).replaceAll("_")),
                new Text(contents.toString()));
        stream.end();
        Closeables.close(stream, true);
    }
}