Example usage for org.apache.lucene.analysis CharArraySet CharArraySet

List of usage examples for org.apache.lucene.analysis CharArraySet CharArraySet

Introduction

On this page you can find example usage of the org.apache.lucene.analysis CharArraySet(Collection&lt;?&gt;, boolean) constructor.

Prototype

public CharArraySet(Collection<?> c, boolean ignoreCase) 

Source Link

Document

Creates a set from a Collection of objects.

Usage

From source file:org.karsha.tokenize.DefaultTokenizer.java

License:Open Source License

public DefaultTokenizer() {

    // Two case-sensitive stop-word sets (initial capacity 10), seeded from
    // the Lucene and Terrier stop-word constant arrays respectively.
    this.lu_stop_words = new CharArraySet(10, false);
    this.te_stop_words = new CharArraySet(10, false);

    for (Object word : LUCENE_STOP_WORDS) {
        lu_stop_words.add(word);
    }
    for (Object word : TERRIER_STOP_WORDS) {
        te_stop_words.add(word);
    }
}

From source file:org.lambda3.indra.pp.StandardPreProcessorIterator.java

License:Open Source License

/**
 * Wraps {@code stream} in a StopFilter. Metadata-supplied stop words take
 * precedence (matched case-sensitively); otherwise the language's default
 * stop words are used (matched case-insensitively). If neither is available,
 * the stream is returned unfiltered.
 */
private TokenStream getStopFilter(String lang, Set<String> metadataStopWords, TokenStream stream) {

    boolean hasMetadataWords = metadataStopWords != null && !metadataStopWords.isEmpty();
    if (hasMetadataWords) {
        return new StopFilter(stream, new CharArraySet(metadataStopWords, false));
    }

    try {
        Set<String> defaults = getDefaultStopWordSet(lang);
        if (defaults != null) {
            CharArraySet defaultStopWords = new CharArraySet(30, true);
            defaultStopWords.addAll(defaults);
            return new StopFilter(stream, defaultStopWords);
        }
    } catch (IndraException e) {
        throw new IndraRuntimeException(String.format("Error creating stop filter for lang '%s'", lang), e);
    }

    // No stop words available for this language: pass the stream through.
    return stream;
}

From source file:org.tallison.lucene.contrast.QueryToCorpusContraster.java

License:Apache License

/**
 * Searches the corpus with {@code query} and tallies, per term in
 * {@code fieldName}, the number of matching documents containing it,
 * then delegates to getResults to rank the contrastive terms.
 */
public List<TermIDF> contrast(Query query, String fieldName, int numResults) throws IOException {
    TopScoreDocCollector results = TopScoreDocCollector.create(maxDocs, maxDocs + 10000);
    searcher.search(query, results);

    ScoreDoc[] scoreDocs = results.topDocs().scoreDocs;
    // With fewer hits than minTermFreq no term can qualify — bail out early.
    if (scoreDocs.length < minTermFreq) {
        return new ArrayList<TermIDF>();
    }

    // Rough capacity guess (the original labels this a "total hack").
    int initialCapacity = scoreDocs.length * 100;
    CharArrayMap<MutableValueInt> docFreqs = new CharArrayMap<MutableValueInt>(initialCapacity, ignoreCase);
    CharArraySet termsInDoc = new CharArraySet(100, ignoreCase);

    Set<String> fieldSelector = new HashSet<String>();
    fieldSelector.add(fieldName);

    for (ScoreDoc scoreDoc : scoreDocs) {
        // Collect this document's distinct terms into termsInDoc...
        processDoc(scoreDoc.doc, fieldName, fieldSelector, termsInDoc);
        // ...then bump each term's document-frequency count.
        for (Object term : termsInDoc) {
            char[] token = (char[]) term;
            MutableValueInt docCount = docFreqs.get(token, 0, token.length);
            if (docCount == null) {
                docCount = new MutableValueInt();
                docCount.value = 1;
            } else {
                docCount.value++;
            }
            docFreqs.put(token, docCount);
        }
        termsInDoc.clear();
    }

    return getResults(fieldName, docFreqs, numResults);
}

From source file:reviews.indexing.IndexReviews.java

License:Apache License

/**
 * Builds a case-insensitive stop-word set: Lucene's built-in English stop
 * words plus one word per line read from {@code filename}.
 *
 * <p>I/O errors are best-effort: the stack trace is printed and whatever
 * was accumulated so far is returned (matches the original behavior).
 */
private static CharArraySet readStopWords(String filename) {

    CharArraySet stopwords = new CharArraySet(StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);

    // try-with-resources: the original leaked the reader on every path.
    // NOTE(review): FileReader uses the platform default charset — confirm
    // the stop-word file's encoding, or switch to an explicit charset.
    try (BufferedReader br = new BufferedReader(new FileReader(new File(filename)))) {
        String line;
        while ((line = br.readLine()) != null) {
            stopwords.add(line.trim());
        }
    } catch (IOException e) {
        // Covers FileNotFoundException too (it is an IOException subclass);
        // the original handled both identically.
        e.printStackTrace();
    }

    return stopwords;
}

From source file:ri.AnalyzerNuevo.java

@Override
protected TokenStreamComponents createComponents(String string) {
    // Tokenize the incoming text with Lucene's standard tokenizer.
    final Tokenizer source = new StandardTokenizer();
    source.setReader(new StringReader(string));

    // Analysis chain: standard filter -> strip English possessives ('s)
    // -> fold accents to ASCII -> lowercase -> remove stop words
    // (case-insensitive, from the stopwords collection) -> Porter stemming.
    TokenStream chain = new StandardFilter(source);
    chain = new EnglishPossessiveFilter(chain);
    chain = new ASCIIFoldingFilter(chain);
    chain = new LowerCaseFilter(chain);
    chain = new StopFilter(chain, new CharArraySet(stopwords, true));
    chain = new PorterStemFilter(chain);

    return new TokenStreamComponents(source, chain);
}

From source file:tarefa1.IndexFiles.java

License:Apache License

/** Index all text files under a directory. */
/**
 * Index all text files under a directory.
 *
 * <p>Flags: {@code -index INDEX_PATH} (default "index"), {@code -docs DOCS_PATH},
 * {@code -update} (append to an existing index instead of recreating it).
 */
public static void main(String[] args) {
    String usage = "java org.apache.lucene.demo.IndexFiles"
            + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
            + "This indexes the documents in DOCS_PATH, creating a Lucene index"
            + "in INDEX_PATH that can be searched with SearchFiles";
    String indexPath = "index";
    boolean create = true;

    // NOTE(review): docsPath, stoplist and stemming are fields declared
    // outside this method — confirm their defaults before relying on them.
    for (int i = 0; i < args.length; i++) {
        if ("-index".equals(args[i])) {
            indexPath = args[i + 1];
            i++;
        } else if ("-docs".equals(args[i])) {
            docsPath = args[i + 1];
            i++;
        } else if ("-update".equals(args[i])) {
            create = false;
        }
    }

    if (docsPath == null) {
        System.err.println("Usage: " + usage);
        System.exit(1);
    }

    final Path docDir = Paths.get(docsPath);
    if (!Files.isReadable(docDir)) {
        System.out.println("Document directory '" + docDir.toAbsolutePath()
                + "' does not exist or is not readable, please check the path");
        System.exit(1);
    }

    Date start = new Date();
    try {
        System.out.println("Indexing to directory '" + indexPath + "'...");

        Directory dir = FSDirectory.open(Paths.get(indexPath));

        // Analyzer selection: with a stop list, use the stock analyzers and
        // their built-in stop words; without one, pass an empty stop set so
        // no stop-word removal happens.
        Analyzer analyzer;
        if (stoplist) {
            analyzer = stemming ? new EnglishAnalyzer() : new StandardAnalyzer();
        } else {
            CharArraySet noStopWords = new CharArraySet(2, true);
            analyzer = stemming ? new EnglishAnalyzer(noStopWords) : new StandardAnalyzer(noStopWords);
        }

        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        // CREATE wipes any previously indexed documents;
        // CREATE_OR_APPEND adds to an existing index.
        iwc.setOpenMode(create ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);

        // Optional: iwc.setRAMBufferSizeMB(256.0) for large indexing jobs
        // (raise the JVM max heap accordingly, e.g. -Xmx512m or -Xmx1g).

        // try-with-resources: the original leaked the IndexWriter (and kept
        // the index write lock held) whenever indexDocs threw before close().
        try (IndexWriter writer = new IndexWriter(dir, iwc)) {
            indexDocs(writer, docDir);
            // Optional: writer.forceMerge(1) — costly; only worth it for
            // indexes that are mostly static after this run.
        }

        Date end = new Date();
        System.out.println(end.getTime() - start.getTime() + " total milliseconds");

    } catch (IOException e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}

From source file:tarefa1.SearchFiles.java

License:Apache License

/** Simple command-line based search demo. */
/**
 * Simple command-line based search demo.
 *
 * <p>Reads queries from {@code -queries FILE}, from {@code -query STRING},
 * or interactively from stdin, and runs them against the index at
 * {@code -index DIR} (default "index").
 */
public static void main(String[] args) throws Exception {
    String usage = "Usage:\tjava org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n] [-queries file] [-query string] [-raw] [-paging hitsPerPage]\n\nSee http://lucene.apache.org/core/4_1_0/demo/ for details.";
    if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) {
        System.out.println(usage);
        System.exit(0);
    }

    String index = "index";
    String field = "contents";
    String queries = null;
    int repeat = 0;
    boolean raw = false;
    String queryString = null;
    int hitsPerPage = 10;

    for (int i = 0; i < args.length; i++) {
        if ("-index".equals(args[i])) {
            index = args[i + 1];
            i++;
        } else if ("-field".equals(args[i])) {
            field = args[i + 1];
            i++;
        } else if ("-queries".equals(args[i])) {
            queries = args[i + 1];
            i++;
        } else if ("-query".equals(args[i])) {
            queryString = args[i + 1];
            i++;
        } else if ("-repeat".equals(args[i])) {
            repeat = Integer.parseInt(args[i + 1]);
            i++;
        } else if ("-raw".equals(args[i])) {
            raw = true;
        } else if ("-paging".equals(args[i])) {
            hitsPerPage = Integer.parseInt(args[i + 1]);
            if (hitsPerPage <= 0) {
                System.err.println("There must be at least 1 hit per page.");
                System.exit(1);
            }
            i++;
        }
    }

    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
    IndexSearcher searcher = new IndexSearcher(reader);

    // Analyzer must mirror the one used at indexing time.
    // NOTE(review): stoplist and stemming are fields declared outside this method.
    Analyzer analyzer;
    if (stoplist) {
        analyzer = stemming ? new EnglishAnalyzer() : new StandardAnalyzer();
    } else {
        CharArraySet noStopWords = new CharArraySet(2, true);
        analyzer = stemming ? new EnglishAnalyzer(noStopWords) : new StandardAnalyzer(noStopWords);
    }

    QueryParser parser = new QueryParser(field, analyzer);

    // try-with-resources: the original never closed the query reader.
    try (BufferedReader in = (queries != null)
            ? Files.newBufferedReader(Paths.get(queries), StandardCharsets.UTF_8)
            : new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8))) {

        while (true) {
            if (queries == null && queryString == null) { // prompt the user
                System.out.println("Enter query: ");
            }

            String line = queryString != null ? queryString : in.readLine();

            // Bug fix: the original tested line.length() == -1, which can never
            // be true (String#length() is non-negative); end of input is just null.
            if (line == null) {
                break;
            }

            line = line.trim();
            if (line.length() == 0) {
                break;
            }

            Query query = parser.parse(line);
            System.out.println("Searching for: " + query.toString(field));

            if (repeat > 0) { // repeat & time as benchmark
                Date start = new Date();
                for (int i = 0; i < repeat; i++) {
                    searcher.search(query, 100);
                }
                Date end = new Date();
                System.out.println("Time: " + (end.getTime() - start.getTime()) + "ms");
            }

            doPagingSearch(in, searcher, query, hitsPerPage, raw, queries == null && queryString == null);

            if (queryString != null) {
                break;
            }
        }
    }
    reader.close();
}