lucenesearch.NGram.java Source code

Introduction

Here is the source code for lucenesearch.NGram.java. The NGram class reads answer posts from a Lucene index, extracts 1-, 2- and 4-grams from each post body, and writes them, together with the post's tags, to CSV files under ./data/grams/. A short usage sketch follows the listing.

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package lucenesearch;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import lucenesearch.LuceneTools.ExtendedDocument;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

/**
 * Extracts 1-, 2- and 4-grams from the bodies of answer posts in a Lucene
 * index and writes them, together with each post's tags, to a CSV file.
 *
 * @author arashdn
 */
public class NGram {

    // Maps a post id to the list of tags attached to that post.
    private HashMap<Integer, ArrayList<String>> tags;
    // The tag whose data files are processed; it is excluded from the output.
    private String mainTag;

    public NGram(String mainTag) throws FileNotFoundException, IOException {
        init(mainTag);
    }

    private void init(String mainTag) throws FileNotFoundException, IOException {
        this.mainTag = mainTag;
        tags = new HashMap<>();
        // Each line of the tag file has the form "postId,tag"; collect all tags per post id.
        try (BufferedReader br = new BufferedReader(new FileReader("./data/" + mainTag + "_a_tag.txt"))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] tmp = line.split(",");
                int aid = Integer.parseInt(tmp[0]);
                ArrayList<String> tg = tags.get(aid);
                if (tg == null) {
                    tg = new ArrayList<>();
                    tags.put(aid, tg);
                }
                //if(tmp[1] != null && tmp[1] != "" && !tmp[1].equalsIgnoreCase("java"))
                tg.add(tmp[1]);
            }
        }
    }

    // Convenience overload: uses a very large hit count so that effectively
    // all matching posts are processed.
    public void getNGram(int n) throws IOException, ParseException {
        getNGram(n, 10000000);
    }

    public void getNGram(int n, int hitPP) throws IOException, ParseException {
        int hitsPerPage = hitPP;

        String index = new Searcher().getPostIndexPath();
        IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
        IndexSearcher searcher = new IndexSearcher(reader);
        BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder();

        //booleanQuery.add(new QueryParser("Body", analyzer).parse(""), BooleanClause.Occur.MUST);
        // Keep only posts with PostTypeId == 2, i.e. answers in the Stack Exchange data dump.
        booleanQuery.add(IntPoint.newExactQuery("PostTypeId", 2), BooleanClause.Occur.MUST);

        TopDocs results = searcher.search(booleanQuery.build(), hitsPerPage);

        ScoreDoc[] hits = results.scoreDocs;

        int numTotalHits = results.totalHits;
        System.out.println(numTotalHits + " total matching documents");

        int start = 0;
        int end = Math.min(numTotalHits, hitsPerPage);

        PrintWriter pw = new PrintWriter("./data/grams/" + n + "gram.csv");

        StringBuilder sb;
        for (int i = start; i < end; i++) {
            Document doc = searcher.doc(hits[i].doc);
            ArrayList<String[]> tmp = getNGrams(doc, new ExtendedDocument(hits[i].doc, reader), n);
            for (String[] ngrams : tmp) {
                sb = new StringBuilder();
                sb.append(doc.get("SId"));
                sb.append(",");
                sb.append(toTabbedStr(ngrams));
                sb.append(",");
                ArrayList<String> tagg = tags.get(Integer.parseInt(doc.get("SId")));
                if (tagg == null) // post id not present in the tag file
                    continue;
                sb.append(implodeTabbed(tagg));
                sb.append("\n");
                // Only emit rows for posts that have at least one tag besides the main tag.
                if (tagg.size() > 1)
                    pw.print(sb.toString());
            }

        }
        pw.close();
        reader.close();
    }

    private String toTabbedStr(String[] ngrams) {
        // Joins the n-gram's terms with tabs.
        return String.join("\t", ngrams);
    }

    // Dispatches to the supported n-gram extractors (n = 1, 2 or 4).
    private ArrayList<String[]> getNGrams(Document originalDoc, ExtendedDocument doc, int n) throws IOException {
        if (n == 1) {
            return get1Gram(originalDoc, doc);
        } else if (n == 2 || n == 4) {
            return getKGram(doc, n);
        } else {
            // Returning null here would trigger a NullPointerException in the caller's loop.
            throw new IllegalArgumentException("Unsupported n-gram size: " + n);
        }
    }

    private ArrayList<String[]> get1Gram(Document originalDoc, ExtendedDocument doc) throws IOException {
        ArrayList<String[]> res = new ArrayList<>();

        // Unigrams are taken from the re-analyzed body text; reading them from
        // the term vector instead would yield each distinct term only once, in
        // sorted order, and so drop repeated words.
        ArrayList<String> t = new lucenesearch.LuceneTools.LuceneUtils(null)
                .getAnalyzedRemoveHtml(originalDoc.get("Body"));
        for (String termText : t) {
            res.add(new String[]{termText});
        }

        return res;
    }

    // Builds k-grams over the terms stored in the "Body" term vector. Note that
    // a TermsEnum returns the document's distinct terms in sorted order, so the
    // resulting k-grams are windows over the sorted vocabulary, not over the
    // original word sequence of the post.
    private ArrayList<String[]> getKGram(ExtendedDocument doc, int k) throws IOException {
        ArrayList<String[]> res = new ArrayList<>();

        Terms t = doc.getTermVector("Body");
        TermsEnum itr = t.iterator();
        BytesRef term;
        ArrayList<String> terms = new ArrayList<>();
        while ((term = itr.next()) != null) {
            terms.add(term.utf8ToString());
        }

        int n = terms.size();
        for (int i = k - 1; i < n; i++) {
            String[] temp = new String[k];
            for (int j = 0; j < k; j++) {
                temp[j] = terms.get(i - k + 1 + j);
            }
            res.add(temp);
        }

        return res;
    }

    private String implodeTabbed(ArrayList<String> tags) {
        StringBuilder sb = new StringBuilder();
        for (String tag : tags) {
            if (!tag.equalsIgnoreCase(mainTag))
                sb.append(tag).append("\t");
        }
        String res = sb.toString();
        if (res.length() == 0)
            return "#"; // placeholder when the post has no tag other than the main tag
        return res.substring(0, res.length() - 1); // drop the trailing tab
    }

}
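
Usage example

The file itself contains no driver, so here is a minimal, hypothetical sketch of how the class might be invoked. It assumes the Lucene post index has already been built (via the project's Searcher class), that the ./data/grams/ output directory and a ./data/java_a_tag.txt tag file exist, and that "java" is only an example main tag; NGramDemo is not part of the original project.

package lucenesearch;

// Hypothetical driver class (not part of the original project).
public class NGramDemo {
    public static void main(String[] args) throws Exception {
        // "java" is an example main tag; any tag with a matching
        // ./data/<tag>_a_tag.txt file would work.
        NGram ngram = new NGram("java");
        ngram.getNGram(1); // writes ./data/grams/1gram.csv
        ngram.getNGram(2); // writes ./data/grams/2gram.csv
        ngram.getNGram(4); // writes ./data/grams/4gram.csv
    }
}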