Example usage for org.apache.lucene.analysis.tokenattributes CharTermAttribute buffer

Introduction

On this page you can find example usages of org.apache.lucene.analysis.tokenattributes.CharTermAttribute.buffer().

Prototype

public char[] buffer();

Document

Returns the internal termBuffer character array, which you can then directly alter. If the array is too small for your token, use resizeBuffer(int) to increase it. After altering the buffer, be sure to call setLength(int) to record the number of valid characters that were placed into the termBuffer.
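
Before the usage examples, here is a minimal self-contained sketch of the read-only pattern they all share. It assumes Lucene 5.x or later (where analyzers no longer take a Version argument); the WhitespaceAnalyzer and the sample text are placeholders for illustration:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class BufferExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new WhitespaceAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("", "hello lucene buffers")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // Only the first length() characters of buffer() are valid for
                // the current token; the array itself is reused between tokens.
                System.out.println(new String(termAtt.buffer(), 0, termAtt.length()));
            }
            ts.end();
        }
    }
}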

Usage

From source file:at.ac.univie.mminf.luceneSKOS.analysis.AbstractMeSHFilter.java

License:Apache License

public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
    TokenStream ts = analyzer.tokenStream("", new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    // PositionIncrementAttribute posIncAtt =
    // ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    reuse.length = 0;
    while (ts.incrementToken()) {
        int length = termAtt.length();
        if (length == 0) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
        }
        // if (posIncAtt.getPositionIncrement() != 1) {
        // throw new IllegalArgumentException("term: " + text +
        // " analyzed to a token with posinc != 1");
        // }
        reuse.grow(reuse.length + length + 1); /* current + word + separator */
        int end = reuse.offset + reuse.length;
        if (reuse.length > 0) {
            reuse.chars[end++] = ' '; // separator
            reuse.length++;
        }
        System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
        reuse.length += length;
    }
    ts.end();
    ts.close();
    if (reuse.length == 0) {
        throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
    }
    return reuse;
}
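
A hypothetical call site for this helper (the analyzer variable and the input string are placeholders, not part of the original source):

CharsRef scratch = new CharsRef(32);
CharsRef joined = analyze(someAnalyzer, "myocardial infarction", scratch);
// joined now holds the analyzed tokens re-joined with single spaces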

From source file:at.ac.univie.mminf.luceneSKOS.analysis.SNOMEDFilter.java

License:Apache License

public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
    TokenStream ts = analyzer.tokenStream("", new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    // PositionIncrementAttribute posIncAtt =
    // ts.addAttribute(PositionIncrementAttribute.class);
    boolean phraseTerm = false;
    ts.reset();
    reuse.length = 0;
    while (ts.incrementToken()) {
        // System.out.println(text + " | " + termAtt.toString());
        int length = termAtt.length();
        if (length == 0) {
            throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
        }
        // if (posIncAtt.getPositionIncrement() != 1) {
        // throw new IllegalArgumentException("term: " + text +
        // " analyzed to a token with posinc != 1");
        // }
        reuse.grow(reuse.length + length + 1); /* current + word + separator */
        int end = reuse.offset + reuse.length;
        if (reuse.length > 0) {
            reuse.chars[end++] = ' '; // separator
            reuse.length++;
            phraseTerm = true;
        }
        System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
        reuse.length += length;
    }
    ts.end();
    ts.close();
    if (reuse.length == 0) {
        throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
    }

    if (phraseTerm) {
        reuse.grow(reuse.length + 2); /* make room for the two quote characters */
        reuse.length += 2;
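        // Shift the existing characters one position right to open a slot for the leading quote.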
        char next = reuse.chars[0];
        for (int i = 0; i < reuse.length - 2; i++) {
            char tmp = reuse.chars[i + 1];
            reuse.chars[i + 1] = next;
            next = tmp;
        }
        reuse.chars[0] = '\"';
        reuse.chars[reuse.length - 1] = '\"';
    }
    return reuse;
}
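
Note the design choice in the phraseTerm block: whenever more than one token survived analysis, the result is wrapped in double quotes, so a downstream query parser will treat the expansion as a single phrase rather than as independent terms.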

From source file:com.billiger.solr.handler.component.QLTBComponent.java

License:Apache License

/**
 * Get analyzed version of the query string.
 *
 * This uses the analyzer for the configured FieldType for this
 * component to analyze and re-assemble the original query string.
 * If no queryFieldType is configured, the original query will be
 * returned.
 *
 * This is used both in the prepare() stage of the component and
 * when reading the QLTB map data.
 */
String getAnalyzedQuery(String query) throws IOException {
    if (analyzer == null) {
        return query;
    }
    StringBuilder norm = new StringBuilder();
    TokenStream tokens = analyzer.tokenStream("", new StringReader(query));
    tokens.reset();
    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
    while (tokens.incrementToken()) {
        norm.append(termAtt.buffer(), 0, termAtt.length());
    }
    tokens.end();
    tokens.close();
    return norm.toString();
}
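
Because the tokens are appended with no separator, the returned string is the bare concatenation of the analyzed terms; that makes it suitable as a stable normalization key for the QLTB map rather than as a human-readable query.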

From source file:com.clustertest2.clustertest2.vectorization.DocTokenizer.java

public void performWork(Path doc) throws IOException {
    try {
        System.out.println("performing token work");
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder part = new StringBuilder();
        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(doc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            System.out.println(key);
            String value = pair.getSecond().toString();
            part.append(key);
            TokenStream stream = analyzer.tokenStream(key, new StringReader(value));
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            StringTuple document = new StringTuple();
            while (stream.incrementToken()) {
                if (termAtt.length() > 0) {
                    document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                }
            }
            stream.end();
            stream.close();

            tokenized.put(new Text(key), document);
        }
        // write the sequencefile
        Path tokenizedSeq = new Path(vectorsDir, part.toString());
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS,
                ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
            System.out.println("wrote");
        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}

From source file:com.clustertest2.clustertest2.vectorization.TokenBuilder.java

@Override
public void performWork(Path inputDoc, Path outputDir) {
    try {
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder docName = new StringBuilder();
        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(inputDoc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            String value = pair.getSecond().toString();
            docName.append(key);
            StringTuple document;
            try (TokenStream stream = analyzer.tokenStream(key, new StringReader(value))) {
                CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
                stream.reset();
                document = new StringTuple();
                while (stream.incrementToken()) {
                    if (termAtt.length() > 0) {
                        document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                    }
                }
                stream.end();
            }
            tokenized.put(new Text(key), document);
        }
        // write the sequencefile
        Path tokenizedSeq = new Path(outputDir, docName.toString());
        // overwrite old vector file
        ClusterFileService.FS.delete(tokenizedSeq, true);
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS,
                ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
        }
    } catch (IOException e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}

From source file:com.digitalpebble.behemoth.mahout.LuceneTokenizerMapper.java

License:Apache License

@Override
protected void map(Text key, BehemothDocument value, Context context) throws IOException, InterruptedException {
    String sContent = value.getText();
    if (sContent == null) {
        // no text available? skip
        context.getCounter("LuceneTokenizer", "BehemothDocWithoutText").increment(1);
        return;
    }
    TokenStream stream = analyzer.reusableTokenStream(key.toString(), new StringReader(sContent));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    StringTuple document = new StringTuple();
    stream.reset();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    stream.end();
    stream.close();
    context.write(key, document);
}
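
Analyzer.reusableTokenStream is the pre-4.0 Lucene API; it was removed in Lucene 4.0, where tokenStream itself returns reusable components.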

From source file:com.doculibre.constellio.lucene.BaseLuceneIndexHelper.java

License:Open Source License

public static String analyze(String str, Analyzer analyzer) throws IOException {
    if (analyzer == null) {
        return str;
    }
    StringBuilder norm = new StringBuilder();
    TokenStream tokens = analyzer.tokenStream("", new StringReader(str));
    tokens.reset();

    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
    while (tokens.incrementToken()) {
        norm.append(termAtt.buffer(), 0, termAtt.length());
    }
    tokens.end();
    tokens.close();
    return norm.toString();
}

From source file:com.doculibre.constellio.utils.AnalyzerUtils.java

License:Open Source License

public static String analyzePhrase(String phrase, boolean useStopWords) {
    if (StringUtils.isNotBlank(phrase)) {
        String analysedPhrase;
        Analyzer analyzer = getDefaultAnalyzer(useStopWords);

        StringBuilder norm = new StringBuilder();
        TokenStream tokens;
        try {
            tokens = analyzer.tokenStream("", new StringReader(phrase));
            tokens.reset();

            CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
            while (tokens.incrementToken()) {
                norm.append(termAtt.buffer(), 0, termAtt.length());
            }

            tokens.end();
            tokens.close();
            analysedPhrase = norm.toString().trim();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return analysedPhrase;
    } else {
        return phrase;
    }
}

From source file:com.ginobefunny.elasticsearch.plugins.synonym.service.SimpleSynonymMap.java

License:Apache License

private Set<String> analyze(String text) throws IOException {
    Set<String> result = new HashSet<String>();
    Analyzer analyzer = configuration.getAnalyzer();
    try (TokenStream ts = analyzer.tokenStream("", text)) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            int length = termAtt.length();
            if (length == 0) {
                throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
            }
            if (posIncAtt.getPositionIncrement() != 1) {
                throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
            }

            result.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }

        ts.end();
        return result;
    }
}

From source file:com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java

License:Apache License

private String tokenizerToString(Tokenizer tokenizer) throws Exception {
    OffsetAttribute extOffset = tokenizer.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncrAtt = tokenizer.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLengthAtt = tokenizer.addAttribute(PositionLengthAttribute.class);
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
    SemanticClassAttribute semanticClass = tokenizer.addAttribute(SemanticClassAttribute.class);
    PartOfSpeechAttribute pos = tokenizer.addAttribute(PartOfSpeechAttribute.class);

    StringBuilder result = new StringBuilder();
    while (tokenizer.incrementToken()) {
        result.append(new String(term.buffer(), 0, term.length())).append(":");
        result.append(type.type()).append(":");
        result.append(pos.partOfSpeech()).append(":");
        result.append(semanticClass.semanticClass()).append(":");
        result.append(String.valueOf(posIncrAtt.getPositionIncrement())).append(":");
        result.append(String.valueOf(posLengthAtt.getPositionLength())).append(":");
        result.append(String.valueOf(extOffset.startOffset())).append(":");
        result.append(String.valueOf(extOffset.endOffset()));
        result.append(",");
    }
    tokenizer.end();
    return result.toString();
}