com.mathworks.xzheng.analysis.nutch.NutchExample.java Source code

Introduction

Here is the source code for com.mathworks.xzheng.analysis.nutch.NutchExample.java
Source

package com.mathworks.xzheng.analysis.nutch;

/**
 * Copyright Manning Publications Co.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

import java.io.IOException;
import java.io.StringReader;

import javax.security.auth.login.Configuration;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.Query;

// From chapter 4
/**
 * Demonstrates Nutch's document analyzer and its query translation.
 *
 * <p>Prints every token the analyzer emits for a sample string, starting a new
 * output line whenever the token position advances, then parses a phrase query
 * and prints the Lucene query that {@code QueryFilters} produces from it.
 */
public class NutchExample {

    /**
     * Runs the demo and writes its output to standard out.
     *
     * @param args ignored
     * @throws IOException if the token stream fails while being read or closed
     */
    public static void main(String[] args) throws IOException {
        // NOTE(review): the file imports javax.security.auth.login.Configuration, which
        // has no addResource(String); presumably the Hadoop/Nutch Configuration was
        // intended -- confirm and correct the import.
        Configuration conf = Configuration.getConfiguration();
        conf.addResource("nutch-default.xml");
        NutchDocumentAnalyzer analyzer = new NutchDocumentAnalyzer(conf); //1

        TokenStream ts = analyzer.tokenStream("content", new StringReader("The quick brown fox..."));
        try {
            int position = 0;
            while (ts.incrementToken()) { // 2
                // NOTE(review): attributes are normally looked up by their interface
                // (term/offset/type/position-increment); a lookup by the concrete
                // Token class may be rejected by the stream -- confirm against the
                // Lucene version in use.
                Token token = ts.getAttribute(Token.class);
                if (token == null) {
                    break;
                }
                int increment = token.getPositionIncrement();

                // A zero increment means the token shares the previous position,
                // so it is printed on the same output line.
                if (increment > 0) {
                    position = position + increment;
                    System.out.println();
                    System.out.print(position + ": ");
                }

                // termBuffer() is a char[]; calling toString() on it would print the
                // array identity rather than the term, so build the String from the
                // valid prefix of the buffer.
                String term = new String(token.termBuffer(), 0, token.termLength());
                System.out.print("[" + term + ":" + token.startOffset() + "->"
                        + token.endOffset() + ":" + token.type() + "] ");
            }
        } finally {
            // Release the stream (and the reader it wraps) even if tokenization fails.
            ts.close();
        }
        System.out.println();

        // NOTE(review): the unqualified Query resolves to the imported
        // org.apache.lucene.search.Query, which has no parse(String, Configuration);
        // presumably Nutch's own Query class was intended -- confirm the import.
        Query nutchQuery = Query.parse("\"the quick brown\"", conf); // 3
        org.apache.lucene.search.Query luceneQuery;
        luceneQuery = new QueryFilters(conf).filter(nutchQuery); // A
        System.out.println("Translated: " + luceneQuery);
    }
}

/*
#1 Custom analyzer
#2 Display token details
#3 Parse to Nutch's Query
#A Create corresponding translated Lucene Query
*/