com.searchcode.app.util.CodeAnalyzer.java Source code

Introduction

Here is the source code for com.searchcode.app.util.CodeAnalyzer.java
Source

/*
 * Copyright (c) 2016 Boyter Online Services
 *
 * Use of this software is governed by the Fair Source License included
 * in the LICENSE.TXT file, but will be eventually open under GNU General Public License Version 3
 * see the README.md for when this clause will take effect
 *
 * Version 1.3.10
 */

package com.searchcode.app.util;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.util.AttributeFactory;

import java.io.IOException;
import java.io.StringReader;

/**
 * Custom implementation of Lucene Analyzer where we limit to strings 100 characters log.
 * A few other things going on in here.
 * TODO add more details about whats going on
 */
public class CodeAnalyzer extends Analyzer {

    public CodeAnalyzer() {
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        final Tokenizer source = new CodeTokenizer();
        //TokenStream result = new LengthFilter(source, 0, Integer.MAX_VALUE);
        TokenStream result = new LengthFilter(source, 0, 100); // should be enough I hope
        return new TokenStreamComponents(source, result);
    }

    public static void main(String[] args) throws IOException {
        // text to tokenize
        final String text = "This is a demo of the TokenStream API";

        CodeAnalyzer analyzer = new CodeAnalyzer();
        TokenStream stream = analyzer.tokenStream("field", new StringReader(text));

        // get the CharTermAttribute from the TokenStream
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

        try {
            stream.reset();

            // print all tokens until stream is exhausted
            while (stream.incrementToken()) {
                System.out.println(termAtt.toString());
            }

            stream.end();
        } finally {
            stream.close();
        }
    }
}

final class CodeTokenizer extends CharTokenizer {
    public CodeTokenizer() {
    }

    public CodeTokenizer(AttributeFactory factory) {
        super(factory);
    }

    // TODO possible performance issue here because its called so much, maybe cache the results so we can test it more quickly
    protected boolean isTokenChar(int c) {
        return !Character.isWhitespace(c);
    }
}

/**
 * Token filter that keeps only tokens whose term length lies within an inclusive
 * {@code [min, max]} range; all other tokens are rejected.
 */
final class LengthFilter extends FilteringTokenFilter {

    private final int min;
    private final int max;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    /**
     * Create a new LengthFilter. This will filter out tokens whose
     * CharTermAttribute is either too short ({@code length < min})
     * or too long ({@code length > max}).
     *
     * @param in      the TokenStream to consume
     * @param min     the minimum length (inclusive), must not be negative
     * @param max     the maximum length (inclusive), must not be less than {@code min}
     * @throws IllegalArgumentException if {@code min} is negative or greater than {@code max};
     *         such bounds would otherwise silently reject every token
     */
    public LengthFilter(TokenStream in, int min, int max) {
        super(in);
        if (min < 0) {
            throw new IllegalArgumentException(
                    "minimum length must be greater than or equal to zero, got " + min);
        }
        if (min > max) {
            throw new IllegalArgumentException(
                    "maximum length must not be less than minimum length, got min=" + min
                            + " max=" + max);
        }
        this.min = min;
        this.max = max;
    }

    /**
     * Checks the current token against the configured bounds.
     *
     * @return {@code true} if the current term's length is within {@code [min, max]}
     */
    @Override
    public boolean accept() {
        final int len = termAtt.length();
        return len >= min && len <= max;
    }

}