Example usage for org.apache.lucene.analysis.en EnglishAnalyzer EnglishAnalyzer

List of usage examples for org.apache.lucene.analysis.en EnglishAnalyzer EnglishAnalyzer

Introduction

On this page you can find example usage for org.apache.lucene.analysis.en EnglishAnalyzer EnglishAnalyzer.

Prototype

public EnglishAnalyzer(CharArraySet stopwords) 

Source Link

Document

Builds an analyzer with the given stop words.

Usage

From source file: IrqaQuery.java

License: Apache License

/**
 * Opens the shared static {@code writer} over the index directory at
 * {@code indexPath}, analyzing text with an EnglishAnalyzer configured with
 * stop words loaded from {@code stopPath}.
 *
 * @param indexPath filesystem path of the Lucene index directory
 * @param stopPath  path of the stop-word list consumed by mygetStopwords
 * @param sim       similarity name: "TFIDF" selects ClassicSimilarity,
 *                  anything else (including "BM25") selects BM25Similarity
 * @throws IOException if the directory or the IndexWriter cannot be opened
 */
public static void makeIndexWriter(String indexPath, String stopPath, String sim) throws IOException {
    System.out.println("[makeIndexWriter] started");
    System.out.println("[makeIndexWriter]" + stopPath);
    Directory dir = FSDirectory.open(Paths.get(indexPath));
    Analyzer analyzer = new EnglishAnalyzer(StopFilter.makeStopSet(mygetStopwords(stopPath)));
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

    // BM25 is the default; only "TFIDF" switches to classic TF-IDF scoring.
    // (The original had a separate "BM25" branch that duplicated the else.)
    if (sim.equals("TFIDF")) {
        iwc.setSimilarity(new ClassicSimilarity());
    } else {
        iwc.setSimilarity(new BM25Similarity());
    }

    writer = new IndexWriter(dir, iwc);
}

From source file: IrqaQuery.java

License: Apache License

/**
 * Runs {@code question} as a parsed query against the "contents" field of the
 * index and returns the top matching documents.
 *
 * @param index     path of the Lucene index directory
 * @param stoppath  path of the stop-word list consumed by mygetStopwords
 * @param question  free-text question; escaped before parsing so query syntax
 *                  characters are treated literally
 * @param numResult maximum number of documents to return
 * @param sim       similarity name: "TFIDF" selects ClassicSimilarity,
 *                  anything else selects BM25Similarity
 * @return up to numResult matching documents, best first
 * @throws Exception on index I/O or query parse failure
 */
public static List<Document> query(String index, String stoppath, String question, int numResult, String sim)
        throws Exception {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
    try {
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new EnglishAnalyzer(StopFilter.makeStopSet(mygetStopwords(stoppath)));

        // BM25 is the default; only "TFIDF" switches to classic TF-IDF scoring.
        if (sim.equals("TFIDF")) {
            searcher.setSimilarity(new ClassicSimilarity());
        } else {
            searcher.setSimilarity(new BM25Similarity());
        }

        QueryParser parser = new QueryParser("contents", analyzer);
        Query query = parser.parse(parser.escape(question));

        TopDocs results = searcher.search(query, numResult);
        ScoreDoc[] hits = results.scoreDocs;

        int end = Math.min(results.totalHits, numResult);
        List<Document> docs = new ArrayList<Document>(end);
        for (int i = 0; i < end; i++) {
            docs.add(searcher.doc(hits[i].doc));
        }
        return docs;
    } finally {
        reader.close(); // the reader was leaked before; release index file handles
    }
}

From source file: luceneInterface.java

License: Apache License

/**
 * Searches both the "contents" and "sec" fields of the index for the question
 * (OR-combined) and returns the top matching documents.
 *
 * @param index     path of the Lucene index directory
 * @param stoppath  path of the stop-word list consumed by mygetStopwords
 * @param question  free-text question; escaped before use
 * @param numResult maximum number of documents to return
 * @param sim       similarity name: "TFIDF" selects ClassicSimilarity,
 *                  anything else selects BM25Similarity
 * @return up to numResult matching documents, best first
 * @throws Exception on index I/O or query parse failure
 */
public static List<Document> query(String index, String stoppath, String question, int numResult, String sim)
        throws Exception {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
    try {
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new EnglishAnalyzer(StopFilter.makeStopSet(mygetStopwords(stoppath)));

        // BM25 is the default; only "TFIDF" switches to classic TF-IDF scoring.
        if (sim.equals("TFIDF")) {
            searcher.setSimilarity(new ClassicSimilarity());
        } else {
            searcher.setSimilarity(new BM25Similarity());
        }

        QueryParser parser = new QueryParser("contents", analyzer);
        String escaped = parser.escape(question);

        // OR the question against both fields. As in the original, the built
        // query is round-tripped through the parser so the terms are analyzed
        // rather than matched verbatim against the raw question string.
        BooleanQuery.Builder bqb = new BooleanQuery.Builder();
        bqb.add(new TermQuery(new Term("contents", escaped)), BooleanClause.Occur.SHOULD);
        bqb.add(new TermQuery(new Term("sec", escaped)), BooleanClause.Occur.SHOULD);

        TopDocs results = searcher.search(parser.parse(bqb.build().toString()), numResult);
        ScoreDoc[] hits = results.scoreDocs;

        int end = Math.min(results.totalHits, numResult);
        List<Document> docs = new ArrayList<Document>(end);
        for (int i = 0; i < end; i++) {
            docs.add(searcher.doc(hits[i].doc));
        }
        return docs;
    } finally {
        reader.close(); // the reader was leaked before; release index file handles
    }
}

From source file: ai.castor.idf.FetchTermIDF.java

License: Apache License

/**
 * Computes the inverse document frequency of a single term, using classic
 * TF-IDF statistics over the shared {@code reader}'s index.
 *
 * @param term raw term text; it is escaped before parsing
 * @return the IDF value, or 0.0 if the lookup fails for any reason
 * @throws ParseException declared for API compatibility only — parse errors
 *         are actually caught and logged below, never propagated
 */
public double getTermIDF(String term) throws ParseException {
    // No stop words: we need an IDF even for very common terms.
    Analyzer analyzer = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    QueryParser qp = new QueryParser(FIELD_BODY, analyzer);
    ClassicSimilarity similarity = new ClassicSimilarity();

    String esTerm = qp.escape(term);
    double termIDF = 0.0;
    try {
        TermQuery q = (TermQuery) qp.parse(esTerm);
        Term t = q.getTerm();
        termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());

        System.out.println(term + '\t' + esTerm + '\t' + q + '\t' + t + '\t' + termIDF);
    } catch (Exception e) {
        // Best-effort by design: input that analyzes to multiple tokens makes
        // the cast to TermQuery fail (ClassCastException), and any index error
        // is swallowed so the caller simply sees an IDF of 0.0.
        System.err.println("Exception in fetching IDF(" + term + "): " + e.toString());
    }
    return termIDF;
}

From source file: ai.castor.idf.IDFScorer.java

License: Apache License

/**
 * Sums the IDF of each answer term that also occurs in the (analyzed)
 * question, counting each distinct term at most once.
 *
 * @param query   the question text; escaped before parsing
 * @param answer  the candidate answer text, tokenized on whitespace
 * @param analyze true to analyze with an English analyzer (stop words from
 *                {@code stopWords} removed); false for whitespace tokens only
 * @return the accumulated IDF score (0.0 when nothing overlaps)
 * @throws ParseException if the escaped question cannot be parsed
 */
public double calcIDF(String query, String answer, boolean analyze) throws ParseException {
    Analyzer analyzer;
    if (analyze) {
        analyzer = new EnglishAnalyzer(StopFilter.makeStopSet(stopWords));
    } else {
        analyzer = new WhitespaceAnalyzer();
    }

    QueryParser qp = new QueryParser(FIELD_BODY, analyzer);
    ClassicSimilarity similarity = new ClassicSimilarity();

    // Parse the question through the same analyzer, then take the resulting
    // (analyzed) terms as the overlap vocabulary.
    Query question = qp.parse(qp.escape(query));
    HashSet<String> questionTerms = new HashSet<>(Arrays.asList(question.toString().trim().split("\\s+")));

    double idf = 0.0;
    HashSet<String> seenTerms = new HashSet<>();

    for (String term : answer.split("\\s+")) {
        try {
            TermQuery q = (TermQuery) qp.parse(term);
            Term t = q.getTerm();

            // Credit each overlapping term only once: HashSet.add returns
            // false for terms already counted. (Replaces a no-op
            // "idf += 0.0" else branch in the original.)
            if (questionTerms.contains(t.toString()) && seenTerms.add(t.toString())) {
                idf += similarity.idf(reader.docFreq(t), reader.numDocs());
            }
        } catch (Exception e) {
            // Best-effort by design: terms that fail to parse, or that analyze
            // to something other than a single TermQuery, are skipped.
            continue;
        }
    }
    return idf;
}

From source file: ca.mcgill.cs.creco.logic.search.CategorySearch.java

License: Apache License

/**
 * Constructor. Builds an in-memory (RAM-backed) search index over the data
 * store's categories, analyzing text with an EnglishAnalyzer pinned to the
 * class's VERSION constant.
 * @param pDataStore The database whose categories will be in the search index.
 * @throws IOException If an exception is thrown during the creation of the product index.
 */
@Autowired
public CategorySearch(IDataStore pDataStore) throws IOException {
    aDirectory = new RAMDirectory();
    aAnalyzer = new EnglishAnalyzer(VERSION);
    aDataStore = pDataStore;

    buildCategoryIndex();
}

From source file: ch.admin.isb.hermes5.business.search.AnalyserRepository.java

License: Apache License

/**
 * Returns a language-specific Lucene analyzer for the given language code.
 * Recognized codes are "fr", "it" and "en"; any other code falls back to the
 * German analyzer.
 */
public Analyzer getAnalyzer(String lang) {
    // Every analyzer is pinned to the same Lucene 4.7 compatibility version.
    final Version version = Version.LUCENE_47;
    if (lang.equals("fr")) {
        return new FrenchAnalyzer(version);
    }
    if (lang.equals("it")) {
        return new ItalianAnalyzer(version);
    }
    if (lang.equals("en")) {
        return new EnglishAnalyzer(version);
    }
    // German is the default for all unrecognised language codes.
    return new GermanAnalyzer(version);
}

From source file: com.tamingtext.classifier.mlt.MoreLikeThisQueryTest.java

License: Apache License

/**
 * End-to-end smoke test: builds a MoreLikeThis query from the document at
 * {@code inputPath}, searches the index at {@code modelPath}, aggregates hit
 * scores per category, and prints the categories ranked by score.
 * The inline {@code <co>}/{@code <calloutlist>} comments are book callout
 * markers from Taming Text and are kept intact.
 */
@Test
public void testMoreLikeThisQuery() throws Exception {
    //<start id="lucene.examples.mlt.setup"/>
    Directory directory = FSDirectory.open(new File(modelPath));

    IndexReader indexReader = IndexReader.open(directory); //<co id="mlt.indexsetup"/>
    IndexSearcher indexSearcher = new IndexSearcher(indexReader);

    Analyzer analyzer //<co id="mlt.analyzersetup"/>
            = new EnglishAnalyzer(Version.LUCENE_36);

    // Wrap with shingles when n-gram matching was requested.
    if (nGramSize > 1) { //<co id="mlt.ngramsetup"/>
        analyzer = new ShingleAnalyzerWrapper(analyzer, nGramSize, nGramSize);
    }

    MoreLikeThis moreLikeThis = new MoreLikeThis(indexReader); //<co id="mlt.configure"/>
    moreLikeThis.setAnalyzer(analyzer);
    moreLikeThis.setFieldNames(new String[] { "content" });

    /*<calloutlist>
    <callout arearefs="mlt.indexsetup">Open Index</callout>
    <callout arearefs="mlt.analyzersetup">Setup Analyzer</callout>
    <callout arearefs="mlt.ngramsetup">Setup NGrams</callout>
    <callout arearefs="mlt.configure">Create <classname>MoreLikeThis</classname></callout>
    </calloutlist>*/
    //<end id="lucene.examples.mlt.setup"/>

    // For testing against the same corpus: accept terms even when they occur
    // only once in a single document.
    moreLikeThis.setMinTermFreq(1);
    moreLikeThis.setMinDocFreq(1);

    //<start id="lucene.examples.mlt.query"/>
    Reader reader = new FileReader(inputPath); //<co id="mlt.query"/>
    Query query = moreLikeThis.like(reader);

    TopDocs results = indexSearcher.search(query, maxResults); //<co id="mlt.search"/>

    HashMap<String, CategoryHits> categoryHash = new HashMap<String, CategoryHits>();

    // Sum each hit's score into its document's category bucket.
    for (ScoreDoc sd : results.scoreDocs) { //<co id="mlt.collect"/>
        Document d = indexReader.document(sd.doc);
        Fieldable f = d.getFieldable(categoryFieldName);
        String cat = f.stringValue();
        CategoryHits ch = categoryHash.get(cat);
        if (ch == null) {
            ch = new CategoryHits();
            ch.setLabel(cat);
            categoryHash.put(cat, ch);
        }
        ch.incrementScore(sd.score);
    }

    SortedSet<CategoryHits> sortedCats //<co id="mlt.rank"/>
            = new TreeSet<CategoryHits>(CategoryHits.byScoreComparator());
    sortedCats.addAll(categoryHash.values());

    for (CategoryHits c : sortedCats) { //<co id="mlt.display"/>
        System.out.println(c.getLabel() + "\t" + c.getScore());
    }
    /*<calloutlist>
    <callout arearefs="mlt.query">Create Query</callout>
    <callout arearefs="mlt.search">Perform Search</callout>
    <callout arearefs="mlt.collect">Collect Results</callout>
    <callout arearefs="mlt.rank">Rank Categories</callout>
    <callout arearefs="mlt.display">Display Categories</callout>
    </calloutlist>*/
    //<end id="lucene.examples.mlt.query"/>

}

From source file: com.tamingtext.classifier.mlt.TestMoreLikeThis.java

License: Apache License

/**
 * Command-line entry point: classifies each line of the input files with a
 * MoreLikeThis categorizer built over an existing Lucene index, then prints a
 * summary of classification results.
 *
 * Expected input format: one "label&lt;TAB&gt;text" pair per line; malformed
 * lines are skipped.
 */
public static void main(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Option inputDirOpt = obuilder.withLongName("input").withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription("The input directory").withShortName("i").create();

    Option modelOpt = obuilder.withLongName("model").withRequired(true)
            .withArgument(abuilder.withName("index").withMinimum(1).withMaximum(1).create())
            .withDescription("The directory containing the index model").withShortName("m").create();

    // Argument names below were fixed: the originals copy-pasted
    // "index"/"gramSize" from neighbouring options into unrelated arguments.
    Option categoryFieldOpt = obuilder.withLongName("categoryField").withRequired(true)
            .withArgument(abuilder.withName("categoryField").withMinimum(1).withMaximum(1).create())
            .withDescription("Name of the field containing category information").withShortName("catf")
            .create();

    Option contentFieldOpt = obuilder.withLongName("contentField").withRequired(true)
            .withArgument(abuilder.withName("contentField").withMinimum(1).withMaximum(1).create())
            .withDescription("Name of the field containing content information").withShortName("contf")
            .create();

    Option maxResultsOpt = obuilder.withLongName("maxResults").withRequired(false)
            .withArgument(abuilder.withName("maxResults").withMinimum(1).withMaximum(1).create())
            .withDescription("Number of results to retrieve, default: 10 ").withShortName("r").create();

    Option gramSizeOpt = obuilder.withLongName("gramSize").withRequired(false)
            .withArgument(abuilder.withName("gramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("Size of the n-gram. Default Value: 1 ").withShortName("ng").create();

    // The old help text claimed "Default: bayes", but the code below rejects
    // anything other than knn|tfidf, so the description now reflects reality.
    Option typeOpt = obuilder.withLongName("classifierType").withRequired(false)
            .withArgument(abuilder.withName("classifierType").withMinimum(1).withMaximum(1).create())
            .withDescription("Type of classifier: knn|tfidf.").withShortName("type").create();

    Group group = gbuilder.withName("Options").withOption(gramSizeOpt).withOption(helpOpt)
            .withOption(inputDirOpt).withOption(modelOpt).withOption(typeOpt).withOption(contentFieldOpt)
            .withOption(categoryFieldOpt).withOption(maxResultsOpt).create();

    try {
        Parser parser = new Parser();

        parser.setGroup(group);
        parser.setHelpOption(helpOpt);
        CommandLine cmdLine = parser.parse(args);
        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        String classifierType = (String) cmdLine.getValue(typeOpt);

        int gramSize = 1;
        if (cmdLine.hasOption(gramSizeOpt)) {
            gramSize = Integer.parseInt((String) cmdLine.getValue(gramSizeOpt));
        }

        int maxResults = 10;
        if (cmdLine.hasOption(maxResultsOpt)) {
            maxResults = Integer.parseInt((String) cmdLine.getValue(maxResultsOpt));
        }

        String inputPath = (String) cmdLine.getValue(inputDirOpt);
        String modelPath = (String) cmdLine.getValue(modelOpt);
        String categoryField = (String) cmdLine.getValue(categoryFieldOpt);
        String contentField = (String) cmdLine.getValue(contentFieldOpt);

        MatchMode mode;

        if ("knn".equalsIgnoreCase(classifierType)) {
            mode = MatchMode.KNN;
        } else if ("tfidf".equalsIgnoreCase(classifierType)) {
            mode = MatchMode.TFIDF;
        } else {
            throw new IllegalArgumentException("Unknown classifierType: " + classifierType);
        }

        Directory directory = FSDirectory.open(new File(modelPath));
        IndexReader indexReader = IndexReader.open(directory);
        Analyzer analyzer //<co id="mlt.analyzersetup"/>
                = new EnglishAnalyzer(Version.LUCENE_36);

        MoreLikeThisCategorizer categorizer = new MoreLikeThisCategorizer(indexReader, categoryField);
        categorizer.setAnalyzer(analyzer);
        categorizer.setMatchMode(mode);
        categorizer.setFieldNames(new String[] { contentField });
        categorizer.setMaxResults(maxResults);
        categorizer.setNgramSize(gramSize);

        File f = new File(inputPath);
        if (!f.isDirectory()) {
            throw new IllegalArgumentException(f + " is not a directory or does not exist");
        }

        File[] inputFiles = FileUtil.buildFileList(f);

        String line = null;
        //<start id="lucene.examples.mlt.test"/>
        final ClassifierResult UNKNOWN = new ClassifierResult("unknown", 1.0);

        ResultAnalyzer resultAnalyzer = //<co id="co.mlt.ra"/>
                new ResultAnalyzer(categorizer.getCategories(), UNKNOWN.getLabel());

        for (File ff : inputFiles) { //<co id="co.mlt.read"/>
            BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(ff), "UTF-8"));
            while ((line = in.readLine()) != null) {
                // Each line is "label<TAB>text"; skip anything malformed.
                String[] parts = line.split("\t");
                if (parts.length != 2) {
                    continue;
                }

                CategoryHits[] hits //<co id="co.mlt.cat"/>
                        = categorizer.categorize(new StringReader(parts[1]));
                ClassifierResult result = hits.length > 0 ? hits[0] : UNKNOWN;
                resultAnalyzer.addInstance(parts[0], result); //<co id="co.mlt.an"/>
            }

            in.close();
        }

        System.out.println(resultAnalyzer.toString());//<co id="co.mlt.print"/>
        /*
        <calloutlist>
          <callout arearefs="co.mlt.ra">Create <classname>ResultAnalyzer</classname></callout>
          <callout arearefs="co.mlt.read">Read Test data</callout>
          <callout arearefs="co.mlt.cat">Categorize</callout>
          <callout arearefs="co.mlt.an">Collect Results</callout>
          <callout arearefs="co.mlt.print">Display Results</callout>
        </calloutlist>
        */
        //<end id="lucene.examples.mlt.test"/>
    } catch (OptionException e) {
        log.error("Error while parsing options", e);
    }
}

From source file: com.tamingtext.classifier.mlt.TrainMoreLikeThis.java

License: Apache License

/**
 * Opens (creating or overwriting) a Lucene index at {@code pathname} and
 * stores the resulting IndexWriter in {@code this.writer}. Text is analyzed
 * with an EnglishAnalyzer, optionally wrapped with a shingle filter when
 * n-gram indexing was requested. The inline {@code <co>} comments are book
 * callout markers from Taming Text and are kept intact.
 *
 * @param pathname filesystem directory for the index
 * @throws IOException if the directory or the IndexWriter cannot be opened
 */
protected void openIndexWriter(String pathname) throws IOException {
    //<start id="lucene.examples.index.setup"/>
    Directory directory //<co id="luc.index.dir"/>
            = FSDirectory.open(new File(pathname));
    Analyzer analyzer //<co id="luc.index.analyzer"/>
            = new EnglishAnalyzer(Version.LUCENE_36);

    if (nGramSize > 1) { //<co id="luc.index.shingle"/>
        ShingleAnalyzerWrapper sw = new ShingleAnalyzerWrapper(analyzer, nGramSize, // min shingle size
                nGramSize, // max shingle size
                "-", // token separator
                true, // output unigrams
                true); // output unigrams if no shingles
        analyzer = sw;
    }

    IndexWriterConfig config //<co id="luc.index.create"/>
            = new IndexWriterConfig(Version.LUCENE_36, analyzer);
    config.setOpenMode(OpenMode.CREATE); // CREATE overwrites any existing index
    IndexWriter writer = new IndexWriter(directory, config);
    /* <calloutlist>
    <callout arearefs="luc.index.dir">Create Index Directory</callout>
    <callout arearefs="luc.index.analyzer">Setup Analyzer</callout>
    <callout arearefs="luc.index.shingle">Setup Shingle Filter</callout>
    <callout arearefs="luc.index.create">Create <classname>IndexWriter</classname></callout>
    </calloutlist> */
    //<end id="lucene.examples.index.setup"/>
    this.writer = writer;
}