Example usage for org.apache.lucene.analysis.core StopAnalyzer StopAnalyzer

List of usage examples for org.apache.lucene.analysis.core StopAnalyzer StopAnalyzer

Introduction

On this page you can find example usages of the org.apache.lucene.analysis.core.StopAnalyzer constructor, StopAnalyzer().

Prototype

StopAnalyzer

Source Link

Usage

From source file:br.edu.utfpr.cm.JGitMinerWeb.util.LuceneUtil.java

/**
 * Tokenizes a line of text with Lucene's {@code StopAnalyzer} (lower-cases
 * terms and removes English stop words) and returns the terms in order.
 *
 * @param linha the input text to tokenize
 * @return the list of tokens; empty if the text yields no tokens or an
 *         I/O error occurs while reading the stream
 */
public static List<String> tokenizeString(String linha) {
    List<String> result = new ArrayList<>();
    // try-with-resources closes the analyzer (the original leaked both the
    // analyzer and the stream); end() + close() follow the TokenStream contract.
    try (Analyzer analyzer = new StopAnalyzer()) {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(linha));
        try {
            stream.reset();
            while (stream.incrementToken()) {
                result.add(stream.getAttribute(CharTermAttribute.class).toString());
            }
            stream.end();
        } finally {
            stream.close();
        }
    } catch (IOException e) {
        // Preserve original best-effort behavior: report and return what we have.
        System.out.println(e.getMessage());
    }
    return result;
}

From source file:com.faqit.similarity.NGramExtractor.java

License:Open Source License

/**
 * Extracts NGrams from a String of text. Can handle ngrams of any length
 * and also perform stop word removal before extraction.
 *
 * @param text
 *            the text that the ngrams should be extracted from
 * @param length
 *            the length of the ngrams
 * @param stopWords
 *            whether or not stopwords should be removed before extraction
 * @param overlap
 *            whether or not the ngrams should overlap
 * @throws IOException if the token stream cannot be read
 */
public void extract(String text, int length, Boolean stopWords, Boolean overlap)
        throws FileNotFoundException, IOException {

    this.text = text;
    this.length = length;
    this.stopWords = stopWords;
    this.overlap = overlap;

    nGrams = new LinkedList<String>();
    uniqueNGrams = new LinkedList<String>();
    nGramFreqs = new HashMap<String, Integer>();

    /*
     * Unigrams: a plain analyzer suffices. NOTE(review): the stop-word
     * branch uses StandardAnalyzer here while the n-gram branch below uses
     * StopAnalyzer — confirm this asymmetry is intentional.
     */
    if (length == 1) {
        analyzer = this.stopWords ? new StandardAnalyzer() : new SimpleAnalyzer();
    } else {
        // Bigger than unigrams: wrap a base analyzer in a ShingleAnalyzerWrapper
        // that emits only shingles of the requested size, joined with spaces.
        // The empty filler token ("") avoids "_" placeholders being inserted
        // where stop words were removed.
        Analyzer base = this.stopWords ? new StopAnalyzer() : new SimpleAnalyzer();
        analyzer = new ShingleAnalyzerWrapper(base, length, length, " ", false, false, "");
    }

    // Tokenize the text and collect every emitted n-gram.
    TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text));
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String termToken = charTermAttribute.toString();
            nGrams.add(termToken); // Add all ngrams to the ngram LinkedList

            // Non-overlapping mode: skip the next (length - 1) tokens so
            // consecutive n-grams share no terms. Stop skipping as soon as
            // the stream is exhausted — the original kept calling
            // incrementToken() past the end, violating the TokenStream contract.
            if (!overlap) {
                for (int i = 0; i < length - 1 && tokenStream.incrementToken(); i++) {
                    // intentionally empty: advancing past overlapping tokens
                }
            }
        }
        tokenStream.end();
    } finally {
        // The original never closed the stream (resource leak).
        tokenStream.close();
    }

    // Store unique nGrams and frequencies in hash tables, preserving
    // first-seen order in uniqueNGrams.
    for (String nGram : nGrams) {
        if (nGramFreqs.containsKey(nGram)) {
            nGramFreqs.put(nGram, nGramFreqs.get(nGram) + 1);
        } else {
            nGramFreqs.put(nGram, 1);
            uniqueNGrams.add(nGram);
        }
    }
}

From source file:com.hurence.logisland.processor.MatchQuery.java

License:Apache License

@Override
public void init(final ProcessContext context) {

    keywordAnalyzer = new KeywordAnalyzer();
    standardAnalyzer = new StandardAnalyzer();
    stopAnalyzer = new StopAnalyzer();
    matchingRules = new HashMap<>();
    NumericQueryParser queryMatcher = new NumericQueryParser("field");

    // Each dynamic property defines one matching rule:
    // property name -> rule name, property value -> Lucene query string.
    for (final Map.Entry<PropertyDescriptor, String> entry : context.getProperties().entrySet()) {
        if (!entry.getKey().isDynamic()) {
            continue;
        }

        final String name = entry.getKey().getName();
        final String query = entry.getValue();
        matchingRules.put(name, new MatchingRule(name, query));
    }

    try {
        monitor = new Monitor(queryMatcher, new TermFilteredPresearcher());

        // TODO infer numeric type here
        if (context.getPropertyValue(NUMERIC_FIELDS).isSet()) {
            final String[] numericFields = context.getPropertyValue(NUMERIC_FIELDS).asString().split(",");
            for (String numericField : numericFields) {
                queryMatcher.setNumericField(numericField);
            }
        }

        // Register every configured rule with the monitor.
        for (MatchingRule rule : matchingRules.values()) {
            MonitorQuery mq = new MonitorQuery(rule.getName(), rule.getQuery());
            monitor.update(mq);
        }
    } catch (IOException e) {
        // Fail fast: the original swallowed this with printStackTrace(),
        // which left `monitor` null and guaranteed a later NPE on first use.
        throw new IllegalStateException("Failed to initialize match monitor", e);
    }
}

From source file:HW1.generateIndex.java

License:Apache License

/**
 * Indexes all corpus files under a hard-coded directory into four separate
 * indexes, one per analyzer (Keyword, Simple, Stop, Standard).
 *
 * <p>Each corpus file contains multiple records delimited by
 * {@code <DOC>...</DOC>}; records are parsed into field maps by
 * {@code ReadXMLFile} and then written to every index.
 */
public static void main(String[] args) {
    String filePath = "/Users/yangyang/Desktop/lucene/corpus";
    File folder = new File(filePath);
    File[] files = folder.listFiles();

    String[] fields = { "DOCNO", "HEAD", "BYLINE", "DATELINE", "TEXT" };
    ArrayList<HashMap<String, String>> documents = new ArrayList<HashMap<String, String>>();
    int num = 0; // total number of <DOC> records parsed across all files

    for (File file : files) {
        // try-with-resources: the original leaked the reader on every iteration.
        try (BufferedReader br = new BufferedReader(new FileReader(file))) {
            String line;
            String xmlRecords = "";
            while ((line = br.readLine()) != null) {
                // Escape bare "&" so the XML parser does not fail on raw entities.
                if (line.contains("&")) {
                    line = line.replaceAll("&", "&amp;");
                }

                if (line.startsWith("<DOC>")) {
                    xmlRecords = line; // start accumulating a new record

                } else if (line.startsWith("</DOC>")) {
                    xmlRecords += line;
                    // Record complete: parse it into a field map.
                    num += 1;
                    ReadXMLFile r = new ReadXMLFile();
                    HashMap<String, String> document = r.parse(xmlRecords, fields);
                    documents.add(document);

                } else {
                    xmlRecords += line + " ";
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    System.out.println(num);

    String[] indexPaths = { "/Users/yangyang/Desktop/lucene/index/index01",
            "/Users/yangyang/Desktop/lucene/index/index02", "/Users/yangyang/Desktop/lucene/index/index03",
            "/Users/yangyang/Desktop/lucene/index/index04", };
    for (String indexPath : indexPaths) {
        try {
            System.out.println("Indexing to directory '" + indexPath + "'...");

            Directory dir = FSDirectory.open(Paths.get(indexPath));
            // Analyzer choice is keyed off the last character of the index path.
            // NOTE(review): a path not ending in 1-4 leaves analyzer null and
            // would NPE below — confirm all paths are covered by convention.
            Analyzer analyzer = null;
            if (indexPath.endsWith("1")) {
                analyzer = new KeywordAnalyzer();
            } else if (indexPath.endsWith("2")) {
                analyzer = new SimpleAnalyzer();
            } else if (indexPath.endsWith("3")) {
                analyzer = new StopAnalyzer();
            } else if (indexPath.endsWith("4")) {
                analyzer = new StandardAnalyzer();
            }

            IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

            iwc.setOpenMode(OpenMode.CREATE); // overwrite any existing index

            // try-with-resources: the original left the writer open if
            // indexDoc threw, corrupting/locking the index directory.
            try (IndexWriter writer = new IndexWriter(dir, iwc)) {
                for (HashMap<String, String> doc : documents) {
                    indexDoc(writer, doc);
                }
            }
        } catch (IOException e) {
            System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
        }
    }

}

From source file:index.ForwardIndex.java

/**
 * Builds the forward index (url -> token list) from every row of the
 * {@code pages} table and returns it.
 *
 * @return the populated forward index map (also retained in {@code indexMap})
 */
public Map<String, List<String>> createForwardIndex() {
    try {
        String query = "select * from pages";
        ResultSet rs = db.executeQuery(query);

        System.out.println("Start creating forward index");

        // One analyzer for the whole pass — the original allocated (and
        // never closed) a fresh StopAnalyzer for every row.
        try (Analyzer analyzer = new StopAnalyzer()) {
            while (rs.next()) {
                String url = rs.getString("url");
                String file = rs.getString("raw");
                int offset = Integer.parseInt(rs.getString("offset"));
                String content = analyser.getContent(file, offset);

                List<String> segment = LuceneUtil.tokenizeString(analyzer, content);
                indexMap.put(url, segment);
            }
        } finally {
            // Close the result set even when tokenization throws
            // (the original skipped rs.close() on the error path).
            rs.close();
        }
    } catch (Exception e) {
        // Preserve original best-effort behavior: log and return what we have.
        e.printStackTrace();
    }
    System.out.println("Finished, the size of forward index is " + indexMap.size());
    return indexMap;
}

From source file:irlucene.Main.java

/** Entry point: computes mean CFC metrics using a stop-word analyzer. */
public static void main(String args[]) {
    StopAnalyzer stopAnalyzer = new StopAnalyzer();
    metricsMeanCFC(stopAnalyzer, 0);
}

From source file:me.smoe.adar.analyzer.luence.AnalyzerToy.java

License:Apache License

/**
 * Tokenizes {@code sentence} with a {@code StopAnalyzer} and prints each
 * term to standard out, separated by " ,".
 *
 * @param sentence the text to analyze
 * @throws Exception if the token stream cannot be read
 */
public static void analyzerByStop(String sentence) throws Exception {
    Analyzer analyzer = new StopAnalyzer();
    try {
        TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
        // addAttribute() returns the live attribute instance; the original
        // discarded it and re-fetched (with a redundant cast) each iteration.
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            System.out.print(charTermAttribute.toString() + " ,");
        }
        tokenStream.end();   // per TokenStream contract (original omitted end/close)
        tokenStream.close();
    } finally {
        // Close the analyzer even on failure — original leaked it on any throw.
        analyzer.close();
    }
}

From source file:org.hibernate.search.test.envers.SearchAndEnversIntegrationTest.java

License:LGPL

/**
 * Builds a Lucene {@link Query} of the form {@code term:value}.
 *
 * @param term  the field to search
 * @param value the value to match
 * @return the parsed query
 * @throws RuntimeException if the query string cannot be parsed
 */
private Query createLuceneQuery(String term, String value) {
    String searchQuery = term + ":" + value;
    // try-with-resources releases the analyzer — the original never closed it.
    try (Analyzer analyzer = new StopAnalyzer()) {
        QueryParser parser = new QueryParser(term, analyzer);
        return parser.parse(searchQuery);
    } catch (ParseException e) {
        throw new RuntimeException("Unable to parse query", e);
    }
}

From source file:query.Response.java

/**
 * Wires up the inverted index (built eagerly here), the stop-word
 * analyzer, and the result getter used to answer queries.
 */
public Response() {
    this.invertedIndex = new InvertedIndex();
    this.invertedIndexMap = this.invertedIndex.createInvertedIndex();
    this.analyzer = new StopAnalyzer();
    this.resultGetter = new ResultGetter();
}

From source file:stroom.index.server.analyzer.AnalyzerFactory.java

License:Apache License

/**
 * Creates a new Lucene analyzer for the requested type.
 *
 * @param analyzerType  which analyzer family to build
 * @param caseSensitive honored only by the analyzers whose constructor takes it
 * @return a freshly constructed analyzer; a case-sensitive KeywordAnalyzer
 *         when the type is not one of the known cases
 */
public static Analyzer create(final AnalyzerType analyzerType, final boolean caseSensitive) {
    switch (analyzerType) {
    case KEYWORD:
        return new KeywordAnalyzer(caseSensitive);
    case ALPHA:
        return new AlphaAnalyzer(caseSensitive);
    case ALPHA_NUMERIC:
        return new AlphaNumericAnalyzer(caseSensitive);
    case NUMERIC:
        return new NumericAnalyzer();
    case WHITESPACE:
        return new WhitespaceAnalyzer();
    case STOP:
        return new StopAnalyzer();
    case STANDARD:
        return new StandardAnalyzer();
    default:
        // NOTE(review): this fallback ignores the caseSensitive argument and
        // hard-codes true — confirm that is intended for unrecognized types.
        return new KeywordAnalyzer(true);
    }
}