Example usage for org.apache.lucene.analysis.standard StandardAnalyzer StandardAnalyzer

List of usage examples for org.apache.lucene.analysis.standard StandardAnalyzer StandardAnalyzer

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.standard StandardAnalyzer StandardAnalyzer.

Prototype

public StandardAnalyzer(Reader stopwords) throws IOException 

Source Link

Document

Builds an analyzer with the stop words from the given reader.

Usage

From source file:WriteIndex.java

License:Apache License

/**
 * @param args/*from   ww  w .  j av  a2  s .c o  m*/
 */
public static void main(String[] args) throws IOException {

    File docs = new File("documents");
    File indexDir = new File(INDEX_DIRECTORY);

    Directory directory = FSDirectory.open(indexDir);

    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_35);
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35, analyzer);
    IndexWriter writer = new IndexWriter(directory, conf);
    writer.deleteAll();

    for (File file : docs.listFiles()) {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();
        Parser parser = new AutoDetectParser();
        InputStream stream = new FileInputStream(file);
        try {
            parser.parse(stream, handler, metadata, context);
        } catch (TikaException e) {
            e.printStackTrace();
        } catch (SAXException e) {
            e.printStackTrace();
        } finally {
            stream.close();
        }

        String text = handler.toString();
        String fileName = file.getName();

        Document doc = new Document();
        doc.add(new Field("file", fileName, Store.YES, Index.NO));

        for (String key : metadata.names()) {
            String name = key.toLowerCase();
            String value = metadata.get(key);

            if (StringUtils.isBlank(value)) {
                continue;
            }

            if ("keywords".equalsIgnoreCase(key)) {
                for (String keyword : value.split(",?(\\s+)")) {
                    doc.add(new Field(name, keyword, Store.YES, Index.NOT_ANALYZED));
                }
            } else if ("title".equalsIgnoreCase(key)) {
                doc.add(new Field(name, value, Store.YES, Index.ANALYZED));
            } else {
                doc.add(new Field(name, fileName, Store.YES, Index.NOT_ANALYZED));
            }
        }
        doc.add(new Field("text", text, Store.NO, Index.ANALYZED));
        writer.addDocument(doc);

    }

    writer.commit();
    writer.deleteUnusedFiles();

    System.out.println(writer.maxDoc() + " documents written");
}

From source file:MakeLuceneIndexPreprocessed.java

License:Apache License

/** Index all text files under a directory. 
 * @throws UnsupportedEncodingException 
 * @throws FileNotFoundException */
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
    String baseDir = "/home/chrisschaefer/";

    String inputLuceneIndexName = "2013-06-18-lucene-gab";
    String luceneIndexName = "2013-06-18-lucene-gab-standard";

    System.currentTimeMillis();// www  .  j  a v a  2  s  . co  m

    for (int i = 0; i < args.length; ++i) {
        if (args[i].equals("-inputluceneindex"))
            inputLuceneIndexName = args[++i];

        if (args[i].equals("-outputluceneindex"))
            luceneIndexName = args[++i];

        if (args[i].equals("-basedir"))
            baseDir = args[++i];

    }
    String rawTextPath = baseDir + inputLuceneIndexName + "-raw-text.txt";
    String artikelInLinksPath = baseDir + inputLuceneIndexName + "-inlinks.txt";
    String logPath = baseDir + inputLuceneIndexName + ".log";

    PrintWriter logger = new PrintWriter(logPath, "UTF-8");
    logger.println("Indexing to directory '" + baseDir + luceneIndexName + "'");
    System.out.println("Indexing to directory '" + baseDir + luceneIndexName + "'");

    Date start = new Date();
    logger.println(start.toString() + " iArticleCount: 0 iSkippedPageCount: 0");

    try {

        Directory dir = FSDirectory.open(new File(baseDir + luceneIndexName));

        //         Analyzer analyzer = new WikipediaAnalyzer();
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);

        // Create a new index in the directory, removing any
        // previously indexed documents:
        iwc.setOpenMode(OpenMode.CREATE);

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer.  But if you do this, increase the max heap
        // size to the JVM (eg add -Xmxm or -Xmx1g):
        //
        iwc.setRAMBufferSizeMB(2000.0);
        //         iwc.setSimilarity(new ESASimilarity());

        IndexWriter writer = new IndexWriter(dir, iwc);

        int iArticleCount = 0;
        int iSkippedPageCount = 0;

        BufferedReader rawTextReader = new BufferedReader(new FileReader(rawTextPath));
        BufferedReader artikelInLinksReader = new BufferedReader(new FileReader(artikelInLinksPath));
        String lineText = rawTextReader.readLine();
        String lineLinks = artikelInLinksReader.readLine();

        while (lineText != null) {
            //            String title = lineText.substring(0, lineText.indexOf("\t")); 
            //            while(!title.equals(lineLinks.substring(0, lineLinks.indexOf("\t")))){
            //               lineLinks = artikelInLinksReader.readLine();
            //            }
            int endOfTitle = lineText.indexOf("\t");
            String title = lineText.substring(0, endOfTitle);

            if (Integer.valueOf(lineLinks.substring(lineLinks.indexOf("\t") + 1)) > 0) {
                ++iArticleCount;
                Document doc = new Document();
                doc.add(new TextField("contents", title + " " + title + " " + title + " " + title + " "
                        + lineText.substring(endOfTitle + 1), Field.Store.NO));
                //               System.out.println(title + " " + 
                //               title + " " + 
                //               title + " " + 
                //               title + " " +
                //               lineText.substring(endOfTitle+1));
                writer.addDocument(doc);

                if (iArticleCount % 1000 == 0) {
                    writer.commit();
                    logger.println(new Date().toString() + "phase 2 -- iArticleCount: " + iArticleCount
                            + " iSkippedPageCount: " + iSkippedPageCount);
                    logger.flush();
                }
            }
            lineText = rawTextReader.readLine();
            lineLinks = artikelInLinksReader.readLine();
        }
        rawTextReader.close();
        artikelInLinksReader.close();

        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here.  This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        //
        writer.commit();
        writer.forceMerge(1);
        writer.close();

        Date end = new Date();
        String endStatement = end.getTime() - start.getTime() + " total milliseconds ("
                + (end.getTime() - start.getTime()) / 3600000.0 + " hours), " + iArticleCount + " Articles.";
        logger.println(endStatement);
        System.out.println(endStatement);
        logger.close();
    } catch (Exception e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}

From source file:DocIndexer.java

License:Apache License

private RAMDirectory index() throws IOException, UnsupportedEncodingException, FileNotFoundException {
    RAMDirectory directory = new RAMDirectory();
    IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer(CharArraySet.EMPTY_SET));
    config.setOpenMode(OpenMode.CREATE);
    config.setCommitOnClose(true);/*from  w ww.  j  a va 2  s.com*/
    try (IndexWriter iwriter = new IndexWriter(directory, config)) {
        for (String inputFile : inputFiles) {
            File file = new File(inputFile);
            if (file.length() == 0) {
                continue;
            }

            String title;
            try (BufferedReader titleReader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(file), "UTF-8"))) {
                title = titleReader.readLine();
                if (title != null && title.startsWith("[[")) {
                    // Generally the first line of the txt is the title. In a few cases the
                    // first line is a "[[tag]]" and the second line is the title.
                    title = titleReader.readLine();
                }
            }
            Matcher matcher = SECTION_HEADER.matcher(title);
            if (matcher.matches()) {
                title = matcher.group(1);
            }

            String outputFile = AsciiDoctor.mapInFileToOutFile(inputFile, inExt, outExt);
            try (FileReader reader = new FileReader(file)) {
                Document doc = new Document();
                doc.add(new TextField(Constants.DOC_FIELD, reader));
                doc.add(new StringField(Constants.URL_FIELD, prefix + outputFile, Field.Store.YES));
                doc.add(new TextField(Constants.TITLE_FIELD, title, Field.Store.YES));
                iwriter.addDocument(doc);
            }
        }
    }
    return directory;
}

From source file:QueryLuceneIndex.java

License:Apache License

/** Simple command-line based search demo. */
public static void main(String[] args) throws Exception {
    String usage = "Usage:\t[-index dir] [-field f] [-repeat n] [-queries file] [-query string] [-raw] [-paging hitsPerPage]\n\nSee http://lucene.apache.org/core/4_1_0/demo/ for details.";
    if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) {
        System.out.println(usage);
        System.exit(0);/*from   w  w  w .  j a  v a2 s . co m*/
    }

    String index = "/home/chrisschaefer/enwiki-20130604-lucene_bkp";
    String field = "contents";
    String queries = null;
    int repeat = 0;
    boolean raw = false;
    String queryString = null;
    int hitsPerPage = 10;

    for (int i = 0; i < args.length; i++) {
        if ("-index".equals(args[i])) {
            index = args[i + 1];
            i++;
        } else if ("-field".equals(args[i])) {
            field = args[i + 1];
            i++;
        } else if ("-queries".equals(args[i])) {
            queries = args[i + 1];
            i++;
        } else if ("-query".equals(args[i])) {
            queryString = args[i + 1];
            i++;
        } else if ("-repeat".equals(args[i])) {
            repeat = Integer.parseInt(args[i + 1]);
            i++;
        } else if ("-raw".equals(args[i])) {
            raw = true;
        } else if ("-paging".equals(args[i])) {
            hitsPerPage = Integer.parseInt(args[i + 1]);
            if (hitsPerPage <= 0) {
                System.err.println("There must be at least 1 hit per page.");
                System.exit(1);
            }
            i++;
        }
    }

    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(index)));
    IndexSearcher searcher = new IndexSearcher(reader);
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    BufferedReader in = null;
    if (queries != null) {
        in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8"));
    } else {
        in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
    }
    QueryParser parser = new QueryParser(Version.LUCENE_43, field, analyzer);
    while (true) {
        if (queries == null && queryString == null) { // prompt the user
            System.out.println("Enter query: ");
        }

        String line = queryString != null ? queryString : in.readLine();

        if (line == null || line.length() == -1) {
            break;
        }

        line = line.trim();
        if (line.length() == 0) {
            break;
        }
        //Query query = new TermQuery(new Term("title", line));
        Query query = parser.parse(line);
        //System.out.println("Searching for: " + query.toString(field));

        if (repeat > 0) { // repeat & time as benchmark
            Date start = new Date();
            for (int i = 0; i < repeat; i++) {
                searcher.search(query, null, 100);
            }
            Date end = new Date();
            System.out.println("Time: " + (end.getTime() - start.getTime()) + "ms");
        }

        doPagingSearch(in, searcher, query, hitsPerPage, raw, queries == null && queryString == null);

        if (queryString != null) {
            break;
        }
    }
    reader.close();
}

From source file:LuceneIndexDirectoryOrFile.java

License:Apache License

/** Index all text files under a directory. */
public static void main(String[] args) {
    String usage = "java org.apache.lucene.demo.IndexFiles"
            + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
            + "This indexes the documents in DOCS_PATH, creating a Lucene index"
            + "in INDEX_PATH that can be searched with SearchFiles";
    //String indexPath = "index";
    //String docsPath = null;

    String indexPath = indexDir;/*from  w w w  .j a  va  2s.com*/
    String docsPath = sourceDir;

    boolean create = true;

    if (docsPath == null) {
        System.err.println("Usage: " + usage);
        System.exit(1);
    }

    final File docDir = new File(docsPath);
    if (!docDir.exists() || !docDir.canRead()) {
        System.out.println("Document directory '" + docDir.getAbsolutePath()
                + "' does not exist or is not readable, please check the path");
        System.exit(1);
    }

    Date start = new Date();
    try {
        System.out.println("Indexing to directory '" + indexPath + "'...");

        Directory dir = FSDirectory.open(new File(indexPath));
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer);

        if (create) {
            // Create a new index in the directory, removing any
            // previously indexed documents:
            iwc.setOpenMode(OpenMode.CREATE);
        } else {
            // Add new documents to an existing index:
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        }

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer.  But if you do this, increase the max heap
        // size to the JVM (eg add -Xmx512m or -Xmx1g):
        //
        // iwc.setRAMBufferSizeMB(256.0);

        IndexWriter writer = new IndexWriter(dir, iwc);
        indexDocs(writer, docDir);

        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here.  This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        //
        // writer.forceMerge(1);

        writer.close();

        Date end = new Date();
        System.out.println(end.getTime() - start.getTime() + " total milliseconds");

    } catch (IOException e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}

From source file:ClassifierHD.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println(//from w w  w  . ja v a2 s.  co m
                "Arguments: [model] [label index] [dictionnary] [document frequency] [postgres table] [hdfs dir] [job_id]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tablename = args[4];
    String inputDir = args[5];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    Connection conn = null;
    PreparedStatement pstmt = null;

    try {
        Class.forName("org.postgresql.Driver");
        conn = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres",
                "dbwpsdkdl");
        conn.setAutoCommit(false);
        String sql = "INSERT INTO " + tablename
                + " (id,gtime,wtime,target,num,link,body,rep) VALUES (?,?,?,?,?,?,?,?);";
        pstmt = conn.prepareStatement(sql);

        FileSystem fs = FileSystem.get(configuration);
        FileStatus[] status = fs.listStatus(new Path(inputDir));
        BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(fs.create(new Path(inputDir + "/rep.list"), true)));

        for (int i = 0; i < status.length; i++) {
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(status[i].getPath())));
            if (new String(status[i].getPath().getName()).equals("rep.list")) {
                continue;
            }
            int lv_HEAD = 1;
            int lv_cnt = 0;
            String lv_gtime = null;
            String lv_wtime = null;
            String lv_target = null;
            BigDecimal lv_num = null;
            String lv_link = null;
            String[] lv_args;
            String lv_line;
            StringBuilder lv_txt = new StringBuilder();
            while ((lv_line = br.readLine()) != null) {
                if (lv_cnt < lv_HEAD) {
                    lv_args = lv_line.split(",");
                    lv_gtime = lv_args[0];
                    lv_wtime = lv_args[1];
                    lv_target = lv_args[2];
                    lv_num = new BigDecimal(lv_args[3]);
                    lv_link = lv_args[4];
                } else {
                    lv_txt.append(lv_line + '\n');
                }
                lv_cnt++;
            }
            br.close();

            String id = status[i].getPath().getName();
            String message = lv_txt.toString();

            Multiset<String> words = ConcurrentHashMultiset.create();

            TokenStream ts = analyzer.tokenStream("text", new StringReader(message));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            int wordCount = 0;
            while (ts.incrementToken()) {
                if (termAtt.length() > 0) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    Integer wordId = dictionary.get(word);
                    if (wordId != null) {
                        words.add(word);
                        wordCount++;
                    }
                }
            }

            ts.end();
            ts.close();

            Vector vector = new RandomAccessSparseVector(10000);
            TFIDF tfidf = new TFIDF();
            for (Multiset.Entry<String> entry : words.entrySet()) {
                String word = entry.getElement();
                int count = entry.getCount();
                Integer wordId = dictionary.get(word);
                Long freq = documentFrequency.get(wordId);
                double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
                vector.setQuick(wordId, tfIdfValue);
            }
            Vector resultVector = classifier.classifyFull(vector);
            double bestScore = -Double.MAX_VALUE;
            int bestCategoryId = -1;
            for (Element element : resultVector.all()) {
                int categoryId = element.index();
                double score = element.get();
                if (score > bestScore) {
                    bestScore = score;
                    bestCategoryId = categoryId;
                }
            }
            //System.out.println(message);
            //System.out.println(" => "+ lv_gtime + lv_wtime + lv_link + id + ":" + labels.get(bestCategoryId));
            pstmt.setString(1, id);
            pstmt.setString(2, lv_gtime);
            pstmt.setString(3, lv_wtime);
            pstmt.setString(4, lv_target);
            pstmt.setBigDecimal(5, lv_num);
            pstmt.setString(6, lv_link);
            pstmt.setString(7, message.substring(1, Math.min(50, message.length())));
            pstmt.setString(8, labels.get(bestCategoryId));
            pstmt.addBatch();
            bw.write(id + "\t" + labels.get(bestCategoryId) + "\n");
        }
        pstmt.executeBatch();
        //pstmt.clearParameters();
        pstmt.close();
        conn.commit();
        conn.close();
        bw.close();
    } catch (Exception e) {
        System.err.println(e.getClass().getName() + ": " + e.getMessage());
        System.exit(0);
    }
    analyzer.close();
}

From source file:PostgresClassifier.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println(//from   w ww. j a va2  s . co m
                "Arguments: [model] [label index] [dictionnary] [document frequency] [input postgres table]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tablename = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    Connection c = null;
    Statement stmt = null;
    Statement stmtU = null;
    try {
        Class.forName("org.postgresql.Driver");
        c = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres", "dbwpsdkdl");
        c.setAutoCommit(false);
        System.out.println("Opened database successfully");
        stmt = c.createStatement();
        stmtU = c.createStatement();
        ResultSet rs = stmt.executeQuery("SELECT * FROM " + tablename + " WHERE rep is null");

        while (rs.next()) {
            String seq = rs.getString("seq");
            //String rep = rs.getString("rep");
            String body = rs.getString("body");
            //String category = rep;
            String id = seq;
            String message = body;

            //System.out.println("Doc: " + id + "\t" + message);

            Multiset<String> words = ConcurrentHashMultiset.create();

            // extract words from tweet
            TokenStream ts = analyzer.tokenStream("text", new StringReader(message));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            int wordCount = 0;
            while (ts.incrementToken()) {
                if (termAtt.length() > 0) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    Integer wordId = dictionary.get(word);
                    // if the word is not in the dictionary, skip it
                    if (wordId != null) {
                        words.add(word);
                        wordCount++;
                    }
                }
            }
            // Mark : Modified 
            ts.end();
            ts.close();

            // create vector wordId => weight using tfidf
            Vector vector = new RandomAccessSparseVector(10000);
            TFIDF tfidf = new TFIDF();
            for (Multiset.Entry<String> entry : words.entrySet()) {
                String word = entry.getElement();
                int count = entry.getCount();
                Integer wordId = dictionary.get(word);
                Long freq = documentFrequency.get(wordId);
                double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
                vector.setQuick(wordId, tfIdfValue);
            }
            // With the classifier, we get one score for each label 
            // The label with the highest score is the one the tweet is more likely to
            // be associated to
            Vector resultVector = classifier.classifyFull(vector);
            double bestScore = -Double.MAX_VALUE;
            int bestCategoryId = -1;
            for (Element element : resultVector.all()) {
                int categoryId = element.index();
                double score = element.get();
                if (score > bestScore) {
                    bestScore = score;
                    bestCategoryId = categoryId;
                }
                //System.out.print("  " + labels.get(categoryId) + ": " + score);
            }
            //System.out.println(" => " + labels.get(bestCategoryId));
            //System.out.println("UPDATE " + tablename + " SET rep = '" + labels.get(bestCategoryId) + "' WHERE seq = " + id );
            stmtU.executeUpdate("UPDATE " + tablename + " SET rep = '" + labels.get(bestCategoryId)
                    + "' WHERE seq = " + id);
        }
        rs.close();
        stmt.close();
        stmtU.close();
        c.commit();
        c.close();
        analyzer.close();
    } catch (Exception e) {
        System.err.println(e.getClass().getName() + ": " + e.getMessage());
        System.exit(0);
    }
}

From source file:NGramExtractor.java

License:Open Source License

/**
 * Extracts NGrams from a String of text.
 * Can handle ngrams of any length and also perform stop word removal before extraction
 * @param text the text that the ngrams should be extracted from
 * @param length the length of the ngrams
 * @param stopWords whether or not stopwords should be removed before extraction
 * @param overlap whether or not the ngrams should overlap
 *///from w  w  w .  j  a va 2 s . c  om
public void extract(String text, int length, Boolean stopWords, Boolean overlap)
        throws FileNotFoundException, IOException {

    this.text = text;
    this.length = length;
    this.stopWords = stopWords;
    this.overlap = overlap;

    nGrams = new LinkedList<String>();
    uniqueNGrams = new LinkedList<String>();
    nGramFreqs = new HashMap<String, Integer>();

    /* If the minLength and maxLength are both 1, then we want unigrams
     * Make use of a StopAnalyzer when stopwords should be removed
     * Make use of a SimpleAnalyzer when stop words should be included
     */
    if (length == 1) {
        if (this.stopWords) {
            analyzer = new StandardAnalyzer(Version.LUCENE_36);
        } else {
            analyzer = new SimpleAnalyzer(Version.LUCENE_36);
        }
    } else { //Bigger than unigrams so use ShingleAnalyzerWrapper. Once again, different analyzers depending on stop word removal
        if (this.stopWords) {
            analyzer = new ShingleAnalyzerWrapper(new StopAnalyzer(Version.LUCENE_24), length, length, " ",
                    false, false); //This is a hack to use Lucene 2.4 since in 2.4 position increments weren't preserved by default. Using a later version puts underscores (_) in the place of removed stop words.
        } else {
            analyzer = new ShingleAnalyzerWrapper(new SimpleAnalyzer(Version.LUCENE_36), length, length, " ",
                    false, false);
        }
    }

    //Code to process and extract the ngrams
    TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text));
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

    int tokenCount = 0;
    while (tokenStream.incrementToken()) {

        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        String termToken = charTermAttribute.toString(); //The actual token term
        nGrams.add(termToken); //Add all ngrams to the ngram LinkedList

        //If n-grams are not allowed to overlap, then increment to point of no overlap
        if (!overlap) {
            for (int i = 0; i < length - 1; i++) {
                tokenStream.incrementToken();
            }
        }

    }

    //Store unique nGrams and frequencies in hash tables
    for (String nGram : nGrams) {
        if (nGramFreqs.containsKey(nGram)) {
            nGramFreqs.put(nGram, nGramFreqs.get(nGram) + 1);
        } else {
            nGramFreqs.put(nGram, 1);
            uniqueNGrams.add(nGram);
        }
    }

}

From source file:syslogUnityBackend.java

License:Open Source License

public static void main(String[] args) {

    final File INDEX_DIR = new File("/var/lib/syslogUnity/index");
    final StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    final IndexWriter writer;

    try {/*from www  .  j a va2 s  .  c o  m*/
        writer = new IndexWriter(FSDirectory.open(INDEX_DIR), analyzer, IndexWriter.MaxFieldLength.LIMITED);
    } catch (IOException ex) {
        System.out.print("IOException: " + ex + "\n");
        return;
    }
    writer.setRAMBufferSizeMB(8);

    BlockingQueue<org.w3c.dom.Document> q = new LinkedBlockingQueue<org.w3c.dom.Document>();

    syslogReceive logServer = new syslogReceive(q);
    syslogProcess logStore1 = new syslogProcess(q, writer);
    syslogProcess logStore2 = new syslogProcess(q, writer);
    syslogProcess logStore3 = new syslogProcess(q, writer);
    syslogProcess logStore4 = new syslogProcess(q, writer);
    syslogProcess logStore5 = new syslogProcess(q, writer);
    searchServer logSearch = new searchServer(writer, analyzer);

    new Thread(logServer).start();
    new Thread(logStore1).start();
    new Thread(logStore2).start();
    new Thread(logStore3).start();
    new Thread(logStore4).start();
    new Thread(logStore5).start();
    new Thread(logSearch).start();

    Runtime.getRuntime().addShutdownHook(new Thread() {
        public void run() {
            try {
                loopControl.test = false;
                System.out.print("Exiting gracefully...\n");
            } catch (Exception ex) {
                System.out.print("Exception:" + ex + "\n");
            }
        }
    });
}

From source file:SimpleLuceneSearch.java

License:Apache License

public SimpleLuceneSearch(String index) throws CorruptIndexException, IOException {
    this.INDEX = index;// "/home/coulet/workspace/data_resource/mesh/indexOnMesh";
    IndexReader reader = IndexReader.open(FSDirectory.open(new File(INDEX)), true);
    SEARCHER = new IndexSearcher(reader);
    ANALYSER = new StandardAnalyzer(Version.LUCENE_35);
}