Usage examples for the org.apache.lucene.analysis.CharArraySet constructor
public CharArraySet(Collection<?> c, boolean ignoreCase)
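Before the harvested examples, a minimal sketch of this constructor in isolation (class and variable names are illustrative, not taken from the sources below): the collection's contents are copied into the set, and ignoreCase=true makes add and contains case-insensitive.

import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;

public class CharArraySetDemo {
    public static void main(String[] args) {
        // Copy a Collection<String> into the set; true => ignore case
        CharArraySet stopWords = new CharArraySet(Arrays.asList("the", "and", "of"), true);
        System.out.println(stopWords.contains("The")); // true: lookups fold case
        System.out.println(stopWords.contains("xyz")); // false
    }
}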
From source file:org.karsha.tokenize.DefaultTokenizer.java
License:Open Source License
public DefaultTokenizer() {
    this.lu_stop_words = new CharArraySet(10, false);
    this.te_stop_words = new CharArraySet(10, false);
    for (int i = 0; i < LUCENE_STOP_WORDS.length; i++) {
        lu_stop_words.add(LUCENE_STOP_WORDS[i]);
    }
    for (int i = 0; i < TERRIER_STOP_WORDS.length; i++) {
        te_stop_words.add(TERRIER_STOP_WORDS[i]);
    }
}
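Since this page documents the collection-based constructor, the two loops above can be collapsed into one-step construction. A hedged equivalent, assuming LUCENE_STOP_WORDS and TERRIER_STOP_WORDS are String[] (the example does not show their declarations):

// One-step equivalents of the loops above; false => case-sensitive lookups
this.lu_stop_words = new CharArraySet(Arrays.asList(LUCENE_STOP_WORDS), false);
this.te_stop_words = new CharArraySet(Arrays.asList(TERRIER_STOP_WORDS), false);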
From source file:org.lambda3.indra.pp.StandardPreProcessorIterator.java
License:Open Source License
private TokenStream getStopFilter(String lang, Set<String> metadataStopWords, TokenStream stream) {
    if (metadataStopWords != null && !metadataStopWords.isEmpty()) {
        return new StopFilter(stream, new CharArraySet(metadataStopWords, false));
    } else {
        try {
            Set<String> sws = getDefaultStopWordSet(lang);
            if (sws != null) {
                CharArraySet stopWords = new CharArraySet(30, true);
                stopWords.addAll(sws);
                return new StopFilter(stream, stopWords);
            }
        } catch (IndraException e) {
            throw new IndraRuntimeException(String.format("Error creating stop filter for lang '%s'", lang), e);
        }
    }
    return stream;
}
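Note the mixed case handling here: metadata stop words are wrapped case-sensitively (ignoreCase=false), while the per-language defaults go into a case-insensitive set (ignoreCase=true). A small illustrative sketch of the difference:

import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;

public class IgnoreCaseDemo {
    public static void main(String[] args) {
        CharArraySet exact = new CharArraySet(Arrays.asList("The"), false);
        CharArraySet folded = new CharArraySet(Arrays.asList("The"), true);
        System.out.println(exact.contains("the"));  // false: entries must match exactly
        System.out.println(folded.contains("the")); // true: folded on add and lookup
    }
}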
From source file:org.tallison.lucene.contrast.QueryToCorpusContraster.java
License:Apache License
public List<TermIDF> contrast(Query query, String fieldName, int numResults) throws IOException {
    TopScoreDocCollector results = TopScoreDocCollector.create(maxDocs, maxDocs + 10000);
    searcher.search(query, results);
    ScoreDoc[] scoreDocs = results.topDocs().scoreDocs;
    // if there are fewer documents than minTermFreq, return an empty list now
    if (scoreDocs.length < minTermFreq) {
        return new ArrayList<TermIDF>();
    }
    // total hack
    int initialSize = scoreDocs.length * 100;
    CharArrayMap<MutableValueInt> map = new CharArrayMap<MutableValueInt>(initialSize, ignoreCase);
    CharArraySet tmpSet = new CharArraySet(100, ignoreCase);
    Set<String> selector = new HashSet<String>();
    selector.add(fieldName);
    for (ScoreDoc scoreDoc : scoreDocs) {
        // get terms from doc
        processDoc(scoreDoc.doc, fieldName, selector, tmpSet);
        // now update global doc freqs
        Iterator<Object> it = tmpSet.iterator();
        while (it.hasNext()) {
            char[] token = (char[]) it.next();
            MutableValueInt docCount = map.get(token, 0, token.length);
            if (docCount == null) {
                docCount = new MutableValueInt();
                docCount.value = 1;
            } else {
                docCount.value++;
            }
            map.put(token, docCount);
        }
        tmpSet.clear();
    }
    return getResults(fieldName, map, numResults);
}
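The tmpSet above is reused as a per-document deduplication buffer: processDoc adds each distinct term once, the iterator hands back the raw char[] entries, and clear() empties the set for the next document. The same pattern in isolation (illustrative names):

import org.apache.lucene.analysis.CharArraySet;

public class ScratchSetDemo {
    public static void main(String[] args) {
        CharArraySet tmpSet = new CharArraySet(100, false);
        tmpSet.add("alpha");
        tmpSet.add("alpha");              // duplicates collapse: one entry survives
        tmpSet.add("beta");
        for (Object o : tmpSet) {         // iteration yields raw char[] entries
            char[] token = (char[]) o;
            System.out.println(new String(token));
        }
        tmpSet.clear();                   // ready for the next document
    }
}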
From source file:reviews.indexing.IndexReviews.java
License:Apache License
private static CharArraySet readStopWords(String filename) {
    // Seed with Lucene's English defaults, case-insensitive
    CharArraySet stopwords = new CharArraySet(StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
    // try-with-resources closes the reader; FileNotFoundException is an IOException
    try (BufferedReader br = new BufferedReader(new FileReader(new File(filename)))) {
        String line;
        while ((line = br.readLine()) != null) {
            stopwords.add(line.trim());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return stopwords;
}
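Lucene also ships a helper for this one-word-per-line pattern. A sketch of the equivalent using WordlistLoader.getWordSet (the class's package location varies across Lucene versions, so treat this as an assumption to verify against yours):

import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.WordlistLoader;

static CharArraySet readStopWordsWithLoader(String filename) throws IOException {
    // getWordSet reads one entry per line into a CharArraySet
    try (Reader reader = Files.newBufferedReader(Paths.get(filename), StandardCharsets.UTF_8)) {
        return WordlistLoader.getWordSet(reader);
    }
}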
From source file:ri.AnalyzerNuevo.java
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    // Note: the Analyzer framework supplies the Reader after createComponents
    // returns; setting one here would violate the TokenStream contract.

    //SynonymMap.Builder builder = new SynonymMap.Builder(true);
    //builder.add(new CharsRef("text"), new CharsRef("documento"), true);
    //SynonymMap synonymMap;

    TokenStream pipeline = source;
    pipeline = new StandardFilter(pipeline);
    pipeline = new EnglishPossessiveFilter(pipeline);
    /*try {
        synonymMap = builder.build();
        pipeline = new SynonymFilter(pipeline, synonymMap, true);
    } catch (IOException ex) {
        Logger.getLogger(AnalyzerNuevo.class.getName()).log(Level.SEVERE, null, ex);
    }*/
    pipeline = new ASCIIFoldingFilter(pipeline);
    pipeline = new LowerCaseFilter(pipeline);
    pipeline = new StopFilter(pipeline, new CharArraySet(stopwords, true));
    pipeline = new PorterStemFilter(pipeline);
    return new TokenStreamComponents(source, pipeline);
}
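A short usage sketch for an analyzer like this one; the consumption loop (reset, incrementToken, end) is the standard TokenStream pattern, and the field name and input text are made up for illustration:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AnalyzerNuevoDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new AnalyzerNuevo();
        try (TokenStream ts = analyzer.tokenStream("contents", "The documents were indexed")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term); // lowercased, stop-filtered, stemmed terms
            }
            ts.end();
        }
    }
}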
From source file:tarefa1.IndexFiles.java
License:Apache License
/** Index all text files under a directory. */
public static void main(String[] args) {
    String usage = "java org.apache.lucene.demo.IndexFiles"
            + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
            + "This indexes the documents in DOCS_PATH, creating a Lucene index "
            + "in INDEX_PATH that can be searched with SearchFiles";
    String indexPath = "index";
    //String docsPath = null; -- the path comes from main
    boolean create = true;
    for (int i = 0; i < args.length; i++) {
        if ("-index".equals(args[i])) {
            indexPath = args[i + 1];
            i++;
        } else if ("-docs".equals(args[i])) {
            docsPath = args[i + 1];
            i++;
        } else if ("-update".equals(args[i])) {
            create = false;
        }
    }

    if (docsPath == null) {
        System.err.println("Usage: " + usage);
        System.exit(1);
    }

    final Path docDir = Paths.get(docsPath);
    if (!Files.isReadable(docDir)) {
        System.out.println("Document directory '" + docDir.toAbsolutePath()
                + "' does not exist or is not readable, please check the path");
        System.exit(1);
    }

    Date start = new Date();
    try {
        System.out.println("Indexing to directory '" + indexPath + "'...");
        Directory dir = FSDirectory.open(Paths.get(indexPath));
        //Analyzer analyzer = new StandardAnalyzer();
        Analyzer analyzer;
        if (stoplist) {
            if (stemming) {
                analyzer = new EnglishAnalyzer();
            } else {
                analyzer = new StandardAnalyzer();
            }
        } else {
            String[] stopWordsArray = {};
            CharArraySet stopWords = new CharArraySet(2, true);
            stopWords.addAll(Arrays.asList(stopWordsArray));
            if (stemming) {
                analyzer = new EnglishAnalyzer(stopWords);
            } else {
                analyzer = new StandardAnalyzer(stopWords);
            }
        }
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

        if (create) {
            // Create a new index in the directory, removing any
            // previously indexed documents:
            iwc.setOpenMode(OpenMode.CREATE);
        } else {
            // Add new documents to an existing index:
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        }

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer. But if you do this, increase the max heap
        // size to the JVM (eg add -Xmx512m or -Xmx1g):
        //
        // iwc.setRAMBufferSizeMB(256.0);

        IndexWriter writer = new IndexWriter(dir, iwc);
        indexDocs(writer, docDir);

        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here. This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        //
        // writer.forceMerge(1);

        writer.close();

        Date end = new Date();
        System.out.println(end.getTime() - start.getTime() + " total milliseconds");
    } catch (IOException e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}
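When the stop list is disabled, this example builds an empty CharArraySet by hand from an empty array. A shorter equivalent (assuming the same stoplist and stemming flags as above) uses the constant Lucene provides:

// CharArraySet.EMPTY_SET is equivalent to the hand-built empty set above
Analyzer analyzer = stemming ? new EnglishAnalyzer(CharArraySet.EMPTY_SET)
        : new StandardAnalyzer(CharArraySet.EMPTY_SET);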
From source file:tarefa1.SearchFiles.java
License:Apache License
/** Simple command-line based search demo. */
public static void main(String[] args) throws Exception {
    String usage = "Usage:\tjava org.apache.lucene.demo.SearchFiles"
            + " [-index dir] [-field f] [-repeat n] [-queries file] [-query string] [-raw] [-paging hitsPerPage]\n\n"
            + "See http://lucene.apache.org/core/4_1_0/demo/ for details.";
    if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) {
        System.out.println(usage);
        System.exit(0);
    }

    String index = "index";
    String field = "contents";
    String queries = null;
    int repeat = 0;
    boolean raw = false;
    String queryString = null;
    int hitsPerPage = 10;

    for (int i = 0; i < args.length; i++) {
        if ("-index".equals(args[i])) {
            index = args[i + 1];
            i++;
        } else if ("-field".equals(args[i])) {
            field = args[i + 1];
            i++;
        } else if ("-queries".equals(args[i])) {
            queries = args[i + 1];
            i++;
        } else if ("-query".equals(args[i])) {
            queryString = args[i + 1];
            i++;
        } else if ("-repeat".equals(args[i])) {
            repeat = Integer.parseInt(args[i + 1]);
            i++;
        } else if ("-raw".equals(args[i])) {
            raw = true;
        } else if ("-paging".equals(args[i])) {
            hitsPerPage = Integer.parseInt(args[i + 1]);
            if (hitsPerPage <= 0) {
                System.err.println("There must be at least 1 hit per page.");
                System.exit(1);
            }
            i++;
        }
    }

    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
    IndexSearcher searcher = new IndexSearcher(reader);

    Analyzer analyzer;
    if (stoplist) {
        if (stemming) {
            analyzer = new EnglishAnalyzer();
        } else {
            analyzer = new StandardAnalyzer();
        }
    } else {
        String[] stopWordsArray = {};
        CharArraySet stopWords = new CharArraySet(2, true);
        stopWords.addAll(Arrays.asList(stopWordsArray));
        if (stemming) {
            analyzer = new EnglishAnalyzer(stopWords);
        } else {
            analyzer = new StandardAnalyzer(stopWords);
        }
    }

    BufferedReader in = null;
    if (queries != null) {
        in = Files.newBufferedReader(Paths.get(queries), StandardCharsets.UTF_8);
    } else {
        in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
    }
    QueryParser parser = new QueryParser(field, analyzer);
    while (true) {
        if (queries == null && queryString == null) {
            // prompt the user
            System.out.println("Enter query: ");
        }

        String line = queryString != null ? queryString : in.readLine();
        if (line == null) {
            break;
        }

        line = line.trim();
        if (line.length() == 0) {
            break;
        }

        Query query = parser.parse(line);
        System.out.println("Searching for: " + query.toString(field));

        if (repeat > 0) {
            // repeat & time as benchmark
            Date start = new Date();
            for (int i = 0; i < repeat; i++) {
                searcher.search(query, 100);
            }
            Date end = new Date();
            System.out.println("Time: " + (end.getTime() - start.getTime()) + "ms");
        }

        doPagingSearch(in, searcher, query, hitsPerPage, raw, queries == null && queryString == null);

        if (queryString != null) {
            break;
        }
    }
    reader.close();
}