Example usage for org.apache.lucene.analysis.hi HindiAnalyzer HindiAnalyzer

List of usage examples for org.apache.lucene.analysis.hi HindiAnalyzer HindiAnalyzer

Introduction

On this page you can find example usages of the no-argument constructor HindiAnalyzer() of the class org.apache.lucene.analysis.hi.HindiAnalyzer.

Prototype

public HindiAnalyzer() 

Source Link

Document

Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.

Usage

From source file:com.devb.search.IndicIndexer.java

License:Apache License

/**
 * Builds the index under {@code <real path of "/">/hindex/} from the files found directly in
 * {@code <real path of "/">/hdocs/}.
 *
 * <p>Every file is decoded as UTF-8 and split with a {@code StandardTokenizer}; each token is
 * stored as its own document with three fields: {@code "path"} (the file name),
 * {@code "linenumber"} (a per-file counter that increases by one per token, starting at 1) and
 * {@code "contents"} (the token text). A failure while processing one file is reported and the
 * remaining files are still processed. Progress and errors are written to standard output.
 */
@Override
public void makeIndex() {
    String indexPath = servletContext.getRealPath("/") + "/hindex/";
    String docsPath = servletContext.getRealPath("/") + "/hdocs/";
    // Set to false to append to an existing index instead of recreating it.
    final boolean create = true;

    final File docDir = new File(docsPath);
    if (!docDir.exists() || !docDir.canRead()) {
        System.out.println("Document directory '" + docDir.getAbsolutePath()
                + "' does not exist or is not readable, please check the path\n");
        return;
    }

    final long startMillis = System.currentTimeMillis();
    try {
        System.out.println("Indexing to directory '" + indexPath + "'...\n");

        Analyzer analyzer = new HindiAnalyzer();
        // NOTE(review): the first constructor argument is null here; presumably it is the
        // Lucene match version -- confirm against the Lucene release this project uses.
        IndexWriterConfig iwc = new IndexWriterConfig(null, analyzer);
        iwc.setOpenMode(create ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);

        // try-with-resources closes the writer and the directory even when indexing fails
        // part-way, instead of only on the success path.
        try (org.apache.lucene.store.Directory dir = FSDirectory.open(new File(indexPath));
                IndexWriter writer = new IndexWriter(dir, iwc)) {
            String[] files = docDir.isDirectory() ? docDir.list() : null;
            if (files != null) {
                for (String name : files) {
                    File file = new File(docDir, name);
                    if (!file.isFile()) {
                        // Sub-directories and other non-regular entries cannot be opened as a stream.
                        continue;
                    }
                    addTokenDocuments(writer, file);
                }
            }
        }

        // Measured after the writer has been closed, as before.
        System.out.println((System.currentTimeMillis() - startMillis) + " total milliseconds\n");

    } catch (IOException e) {
        System.out.println("Caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}

/**
 * Tokenizes one UTF-8 file and adds one document per token to {@code writer}.
 *
 * <p>Any exception is printed and swallowed so that the caller can continue with the next file.
 *
 * @param writer the open index writer that receives the documents
 * @param file   the regular file to tokenize
 */
private void addTokenDocuments(IndexWriter writer, File file) {
    // All three resources are opened inside the try so none of them leaks if a later
    // constructor or reset() throws; they are closed in reverse order (tokenizer first).
    try (FileInputStream fileInputStream = new FileInputStream(file);
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(fileInputStream, "UTF-8"));
            Tokenizer tokenizer = new StandardTokenizer(reader)) {
        CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();

        // Stored in the "linenumber" field, but it actually counts tokens, not lines.
        int tokenNumber = 0;
        while (tokenizer.incrementToken()) {
            Document doc = new Document();
            doc.add(new StringField("path", file.getName(), Field.Store.YES));
            doc.add(new TextField("linenumber", Integer.toString(++tokenNumber), Store.YES));
            doc.add(new TextField("contents", termAtt.toString(), Store.YES));
            writer.addDocument(doc);
        }
        // Complete the TokenStream workflow (reset / incrementToken / end / close).
        tokenizer.end();
        System.out.println("Adding " + file + "\n");
    } catch (Exception e) {
        // Best effort: report the failure and let the caller move on to the next file.
        e.printStackTrace();
    }
}

From source file:com.devb.search.IndicSearcher.java

License:Apache License

/**
 * Parses {@code this.id} as a query against the {@code "contents"} field of the index under
 * {@code <real path of "/">/hindex/} and records at most ten hits.
 *
 * <p>Nothing is searched when the document directory is missing or unreadable, or when
 * {@code this.id} is {@code null}. Any failure while opening the index, parsing the query or
 * reading hits is printed via {@code printStackTrace()} and ends the search.
 *
 * @param j when {@code true} each hit is appended to {@code ja} as a {@code JSONObject};
 *          otherwise each hit is put into {@code contentProvider} as a {@code SearchResult}
 *          keyed by its position in the hit list
 */
private void callSearch(boolean j) {
    System.out.println("Servlet Ctx " + servletContext.getRealPath("/"));
    String indexPath = servletContext.getRealPath("/") + "/hindex/";
    String docsPath = servletContext.getRealPath("/") + "/hdocs/";

    final File docDir = new File(docsPath);
    if (!docDir.exists() || !docDir.canRead()) {
        System.out.println("Document directory '" + docDir.getAbsolutePath()
                + "' does not exist or is not readable, " + "please check the path\n");
        return;
    }

    // Check the query text before opening the index so an absent query never leaves a
    // reader open.
    final String line = this.id;
    if (line == null) {
        return;
    }

    final String field = "contents";
    final int maxHits = 10;

    // The reader is closed on every path; a failure to open it ends the method instead of
    // continuing with a null searcher/reader.
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)))) {
        IndexSearcher searcher = new IndexSearcher(reader);
        QueryParser parser = new QueryParser(field, new HindiAnalyzer());

        System.out.println("Hindi StandardSearcher / callSearch Line " + line);
        Query query = parser.parse(line);
        System.out.println("Hindi StandardSearcher / callSearch Hindi Query " + query);

        ScoreDoc[] hits = searcher.search(query, null, maxHits).scoreDocs;
        for (int i = 0; i < hits.length; i++) {
            Document hitDoc = searcher.doc(hits[i].doc);

            if (j) {
                JSONObject jo = new JSONObject();
                jo.put("query", query.toString(field));
                jo.put("path", hitDoc.get("path"));
                jo.put("line", hitDoc.get("linenumber"));
                jo.put("contents", hitDoc.get("contents"));

                ja.put(jo);
            } else {
                SearchResult ns = new SearchResult();
                ns.setQuery(query.toString(field));
                ns.setDocPath(hitDoc.get("path"));
                ns.setLineNum(hitDoc.get("linenumber"));
                ns.setContents(hitDoc.get("contents"));
                contentProvider.put(String.valueOf(i), ns);
            }
        }
    } catch (Exception ex) {
        // Covers index I/O errors, query parse errors and failures while recording hits.
        ex.printStackTrace();
    }
}

From source file:com.work.IndexFiles.java

License:Apache License

/**
 * Index all text files under a directory.
 *
 * <p>The index and document locations and the analyzer are hard-coded below; {@code args} is
 * not read. The index is recreated on every run ({@code create} is {@code true}). The process
 * exits with status 1 when the document directory is not readable.
 *
 * @param args ignored
 */
public static void main(String[] args) {

    // English
    //  String indexPath = "C:/Users/Harish/Desktop/IR/Data/Data English/masc_500k_texts/written/letters/index/one/";
    //  String docsPath = "C:/Users/Harish/Desktop/IR/Data/Data English/masc_500k_texts/written/letters/files/";
    //  Analyzer analyzer = new StandardAnalyzer();

    // Hindi
    String indexPath = "C:/Users/Harish/Desktop/IR/Data/Hindi Data/hin_corp_unicode/index/one/";
    String docsPath = "C:/Users/Harish/Desktop/IR/Data/Hindi Data/hin_corp_unicode/sample/";
    Analyzer analyzer = new HindiAnalyzer();

    // Chinese
    //  Analyzer analyzer = new CJKAnalyzer();

    final boolean create = true;

    final Path docDir = Paths.get(docsPath);
    if (!Files.isReadable(docDir)) {
        System.out.println("Document directory '" + docDir.toAbsolutePath()
                + "' does not exist or is not readable, please check the path");
        System.exit(1);
    }

    Date start = new Date();
    try {
        System.out.println("Indexing to directory '" + indexPath + "'...");

        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

        // CREATE removes any previously indexed documents; CREATE_OR_APPEND adds new
        // documents to an existing index.
        iwc.setOpenMode(create ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer.  But if you do this, increase the max heap
        // size to the JVM (eg add -Xmx512m or -Xmx1g):
        //
        // iwc.setRAMBufferSizeMB(256.0);

        // try-with-resources closes the writer and the directory even when indexDocs fails,
        // instead of only on the success path.
        try (Directory dir = FSDirectory.open(Paths.get(indexPath));
                IndexWriter writer = new IndexWriter(dir, iwc)) {
            indexDocs(writer, docDir);

            // NOTE: if you want to maximize search performance,
            // you can optionally call forceMerge here.  This can be
            // a terribly costly operation, so generally it's only
            // worth it when your index is relatively static (ie
            // you're done adding documents to it):
            //
            // writer.forceMerge(1);
        }

        // Measured after the writer has been closed, as before.
        Date end = new Date();
        System.out.println(end.getTime() - start.getTime() + " total milliseconds");

    } catch (IOException e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}

From source file:com.work.SearchFiles.java

License:Apache License

/**
 * Simple command-line based search demo.
 *
 * <p>The index location and analyzer are hard-coded below; {@code args} is not read. Queries
 * are read line by line from the file named by {@code queries} or, when that is {@code null},
 * from standard input (UTF-8). The loop ends at end of input or on a blank line.
 *
 * @param args ignored
 * @throws Exception if the index cannot be opened or a query cannot be parsed or executed
 */
public static void main(String[] args) throws Exception {

    //  String indexPath = "C:/Users/Harish/Desktop/IR/Data/Data English/masc_500k_texts/written/letters/index/one/";
    //  String queries = "C:/Users/Harish/Desktop/IR/Data/Data English/masc_500k_texts/written/letters/query/query1.txt";
    //   Analyzer analyzer = new StandardAnalyzer();

    // Hindi
    String indexPath = "C:/Users/Harish/Desktop/IR/Data/Hindi Data/hin_corp_unicode/index/one/";
    //  String queries = "C:/Users/Harish/Desktop/IR/Data/Hindi Data/hin_corp_unicode/query/one.txt";
    String queries = null;
    Analyzer analyzer = new HindiAnalyzer();

    // Chinese
    //  Analyzer analyzer = new CJKAnalyzer();

    String field = "contents";

    int repeat = 0;
    boolean raw = false;
    String queryString = null;
    int hitsPerPage = 10;

    // The reader is closed on every path, including when parsing or searching throws.
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexPath)))) {
        IndexSearcher searcher = new IndexSearcher(reader);

        BufferedReader in;
        if (queries != null) {
            in = Files.newBufferedReader(Paths.get(queries), StandardCharsets.UTF_8);
        } else {
            in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
        }

        try {
            QueryParser parser = new QueryParser(field, analyzer);

            while (true) {
                if (queries == null && queryString == null) { // prompt the user
                    System.out.println("Enter query: ");
                }

                String line = queryString != null ? queryString : in.readLine();

                // readLine() returns null at end of input.
                if (line == null) {
                    break;
                }

                line = line.trim();
                if (line.length() == 0) {
                    break;
                }

                Query query = parser.parse(line);
                System.out.println("Searching for: " + query.toString(field));

                if (repeat > 0) { // repeat & time as benchmark
                    Date start = new Date();
                    for (int i = 0; i < repeat; i++) {
                        searcher.search(query, 100);
                    }
                    Date end = new Date();
                    System.out.println("Time: " + (end.getTime() - start.getTime()) + "ms");
                }

                doPagingSearch(in, searcher, query, hitsPerPage, raw, queries == null && queryString == null);

                if (queryString != null) {
                    break;
                }
            }
        } finally {
            // Only a file-backed reader is closed here; standard input is left open.
            if (queries != null) {
                in.close();
            }
        }
    }
}

From source file:de.mirkosertic.desktopsearch.AnalyzerCache.java

License:Open Source License

public AnalyzerCache(Configuration aConfiguration) {
    standardAnalyzer = configure(new StandardAnalyzer());
    analyzerByLanguage = new HashMap<>();

    registerIfEnabled(SupportedLanguage.ar, aConfiguration, configure(new ArabicAnalyzer()));
    registerIfEnabled(SupportedLanguage.bg, aConfiguration, configure(new BulgarianAnalyzer()));
    registerIfEnabled(SupportedLanguage.br, aConfiguration, configure(new BrazilianAnalyzer()));
    registerIfEnabled(SupportedLanguage.ca, aConfiguration, configure(new CatalanAnalyzer()));
    registerIfEnabled(SupportedLanguage.ckb, aConfiguration, configure(new SoraniAnalyzer()));
    registerIfEnabled(SupportedLanguage.cz, aConfiguration, configure(new CzechAnalyzer()));
    registerIfEnabled(SupportedLanguage.da, aConfiguration, configure(new DanishAnalyzer()));
    registerIfEnabled(SupportedLanguage.de, aConfiguration, configure(new GermanAnalyzer()));
    registerIfEnabled(SupportedLanguage.el, aConfiguration, configure(new GreekAnalyzer()));
    registerIfEnabled(SupportedLanguage.en, aConfiguration, configure(new EnglishAnalyzer()));
    registerIfEnabled(SupportedLanguage.es, aConfiguration, configure(new SpanishAnalyzer()));
    registerIfEnabled(SupportedLanguage.eu, aConfiguration, configure(new BasqueAnalyzer()));
    registerIfEnabled(SupportedLanguage.fa, aConfiguration, configure(new PersianAnalyzer()));
    registerIfEnabled(SupportedLanguage.fi, aConfiguration, configure(new FinnishAnalyzer()));
    registerIfEnabled(SupportedLanguage.fr, aConfiguration, configure(new FrenchAnalyzer()));
    registerIfEnabled(SupportedLanguage.ga, aConfiguration, configure(new IrishAnalyzer()));
    registerIfEnabled(SupportedLanguage.gl, aConfiguration, configure(new GalicianAnalyzer()));
    registerIfEnabled(SupportedLanguage.hi, aConfiguration, configure(new HindiAnalyzer()));
    registerIfEnabled(SupportedLanguage.hu, aConfiguration, configure(new HungarianAnalyzer()));
    registerIfEnabled(SupportedLanguage.hy, aConfiguration, configure(new ArmenianAnalyzer()));
    registerIfEnabled(SupportedLanguage.id, aConfiguration, configure(new IndonesianAnalyzer()));
    registerIfEnabled(SupportedLanguage.it, aConfiguration, configure(new ItalianAnalyzer()));
    registerIfEnabled(SupportedLanguage.lv, aConfiguration, configure(new LatvianAnalyzer()));
    registerIfEnabled(SupportedLanguage.nl, aConfiguration, configure(new DutchAnalyzer()));
    registerIfEnabled(SupportedLanguage.no, aConfiguration, configure(new NorwegianAnalyzer()));
    registerIfEnabled(SupportedLanguage.pt, aConfiguration, configure(new PortugueseAnalyzer()));
    registerIfEnabled(SupportedLanguage.ro, aConfiguration, configure(new RomanianAnalyzer()));
    registerIfEnabled(SupportedLanguage.ru, aConfiguration, configure(new RussianAnalyzer()));
    registerIfEnabled(SupportedLanguage.sv, aConfiguration, configure(new SwedishAnalyzer()));
    registerIfEnabled(SupportedLanguage.th, aConfiguration, configure(new ThaiAnalyzer()));
    registerIfEnabled(SupportedLanguage.tr, aConfiguration, configure(new TurkishAnalyzer()));
}