Java tutorial: building a Semantic Vectors compatible Lucene index from a Seldon database
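This tutorial walks through CreateLuceneIndexFromDb from the Seldon open source prediction engine: a command-line tool that pulls item, comment or user text out of a Seldon JDBC database, optionally cleans it up (HTML stripping, punctuation transliteration, OpenNLP entity extraction, stop-word removal), and writes each record into a Lucene index whose document paths follow the layout expected by Semantic Vectors.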
/*
 * Seldon -- open source prediction engine
 * =======================================
 * Copyright 2011-2015 Seldon Technologies Ltd and Rummble Ltd (http://www.seldon.io/)
 * **********************************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * **********************************************************************************************
 */
package io.seldon.semvec;

import io.seldon.db.DocumentStore;
import io.seldon.db.SeldonMySQLDocumentStore;
import io.seldon.nlp.AddEntities;
import io.seldon.nlp.TransliteratorPeer;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.HashSet;

import net.htmlparser.jericho.Source;

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;

import com.sampullara.cli.Args;
import com.sampullara.cli.Argument;

/**
 * Create a Lucene index in a form usable by Semantic Vectors.
 * At present the Seldon DB structure for JDBC databases is the only supported source.
 * @author clive
 *
 */
public class CreateLuceneIndexFromDb {

    @Argument(alias = "l", description = "Lucene index directory", required = true)
    private String luceneDir;

    @Argument(alias = "j", description = "Seldon JDBC", required = true)
    private String jdbc;

    @Argument(alias = "it", description = "limit to items of this type", required = false)
    private Integer itemType = 1;

    @Argument(alias = "use-item-attrs", description = "use item attributes", required = false)
    private Boolean useItemAttrs;

    @Argument(alias = "use-comments", description = "use comments", required = false)
    private Boolean useComments;

    @Argument(alias = "use-users", description = "use users", required = false)
    private Boolean useUsers;

    @Argument(alias = "use-user-actions", description = "use user actions", required = false)
    private Boolean useUserActions;

    @Argument(alias = "use-dim", description = "use dimension", required = false)
    private Boolean useDim;

    @Argument(alias = "raw-ids", description = "whether to use the raw ids from db as the ids to put into lucene", required = false)
    Boolean rawIds;

    @Argument(alias = "attr-names", description = "the attribute names from the db", required = false)
    String[] attrNames;

    @Argument(description = "whether to recreate the lucene index", required = false)
    Boolean recreate = false;

    @Argument(description = "print extra debug", required = false)
    boolean debug = false;

    @Argument(alias = "item-limit", description = "limit items from db, -1 no limit (default)", required = false)
    Integer itemLimit = -1;

    @Argument(alias = "delta-mins", description = "delta mins in past to get items", required = false)
    Integer deltaMins = 0;

    @Argument(alias = "delta-days", description = "delta days in past to get items", required = false)
    Integer deltaDays = 0;

    @Argument(alias = "min-tokens", description = "minimum number of tokens in document for it to be added", required = false)
    Integer minTokens = 0;

    @Argument(alias = "use-item-map-datetime", description = "use date field in item_map_datetime db table", required = false)
    boolean useItemMapDatetime = false;

    @Argument(alias = "append-only", description = "append new items to the lucene index and don't re-add existing entries", required = false)
    boolean appendOnly = false;

    @Argument(alias = "use-item-ids", description = "use item ids or names", required = false)
    boolean useItemIds = false;

    @Argument(alias = "remove-html", description = "remove html from text", required = false)
    boolean removeHtml = false;

    @Argument(alias = "positional-index", description = "create a positional index", required = false)
    boolean positionalIndex = false;

    @Argument(alias = "text-attr-ids", description = "attribute ids for data in item_map_text (deprecated)", required = false)
    Integer[] textAttrIds;

    @Argument(alias = "attr-ids", description = "attribute ids for data in item_map_varchar (deprecated)", required = false)
    Integer[] attrIds;

    @Argument(alias = "nlp-attr-ids", description = "attribute ids for data in item_map_varchar (deprecated)", required = false)
    Integer[] nlpAttrIds;

    @Argument(alias = "filter-attr-enum", description = "filter by this attr_id:value_id from table item_map_enum", required = false)
    String filterAttrEnumId;

    @Argument(alias = "extract-persons", description = "extract person entities from text", required = false)
    boolean extractPersons = false;

    @Argument(alias = "extract-organisations", description = "extract organisations from text", required = false)
    boolean extractOrganisations = false;

    @Argument(alias = "extract-places", description = "extract places from text", required = false)
    boolean extractPlaces = false;

    @Argument(alias = "extract-nouns", description = "extract nouns from text", required = false)
    boolean extractNouns = false;

    @Argument(alias = "use-stop-words", description = "use stop words", required = false)
    boolean useStopwords = false;

    @Argument(alias = "stop-words-file", description = "stop words file", required = false)
    String stopWordsFile = null;

    @Argument(alias = "concepts-file", description = "concepts file", required = false)
    String conceptsFile = null;

    @Argument(alias = "open-nlp-location", description = "location of open nlp files", required = false)
    String nlpLocation = null;

    @Argument(alias = "transliterate", description = "transliterate text to remove accents and punctuation", required = false)
    boolean transLiterate = false;

    @Argument(alias = "sequential-ids", description = "ensure sequential ids in lucene", required = false)
    boolean sequentialIds = false;

    @Argument(alias = "store-term-vectors", description = "store term vectors", required = false)
    boolean storeTermVectors = false;

    @Argument(alias = "yahoo-lda-file", description = "output for yahoo LDA", required = false)
    String yahooLDAfile = null;

    @Argument(alias = "client-item-pattern", description = "item pattern to limit items", required = false)
    String clientItemPattern = null;

    @Argument(alias = "output-file", description = "output text to file for each document", required = false)
    String outFile = null;

    private EXTRACTION_TYPE extractionMethod;

    public static enum EXTRACTION_TYPE {
        COMMENTS, ITEM_ATTR, USERS, USER_DIM, USER_ACTIONS;
    };

    static final String FIELD_PATH = "path";

    DocumentStore docStore = null;
    AddEntities addEntities = null;
    long seqId = 0;

    public void createIndex() throws CorruptIndexException, LockObtainFailedException, IOException {
        docStore = new SeldonMySQLDocumentStore(jdbc);
        File luceneFile = new File(luceneDir);
        //IndexWriter writer = new IndexWriter(FSDirectory.open(luceneFile), new StandardAnalyzer(Version.LUCENE_CURRENT), recreate, IndexWriter.MaxFieldLength.LIMITED);
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_CURRENT, new WhitespaceAnalyzer(Version.LUCENE_CURRENT));
        IndexWriter writer = new IndexWriter(FSDirectory.open(luceneFile), config);
        IndexSearcher reader = null;
        DirectoryReader ireader = null;
        if (!recreate) {
            // open a searcher over the existing index so documents can be updated in place
            ireader = DirectoryReader.open(FSDirectory.open(luceneFile));
            reader = new IndexSearcher(ireader);
        }
        BufferedWriter fileWriter = null;
        if (yahooLDAfile != null)
            fileWriter = new BufferedWriter(new FileWriter(yahooLDAfile));
        else if (outFile != null)
            fileWriter = new BufferedWriter(new FileWriter(outFile));
        updateComments(reader, writer, itemType, recreate, fileWriter);
        if (fileWriter != null)
            fileWriter.close();
        if (ireader != null)
            ireader.close();
        writer.close();
    }

    private Document createDoc(String docPath, String val) {
        Document doc = new Document();
        //FieldType ft = new FieldType(StringField.TYPE_STORED);
        //ft.setOmitNorms(false);
        //new Field("field", "value", ft);
        doc.add(new Field(FIELD_PATH, docPath, Field.Store.YES, Field.Index.NOT_ANALYZED));
        if (positionalIndex)
            doc.add(new Field("contents", val, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS));
        else
            doc.add(new Field("contents", val, Field.Store.NO, Field.Index.ANALYZED, storeTermVectors ?
                    Field.TermVector.YES : Field.TermVector.NO));
        return doc;
    }

    // Map a numeric id to a Semantic Vectors style document path,
    // e.g. id 123 -> docs/0000/123, id 123456 -> docs/1234/56
    private String toSV(Long id) {
        String idStr = "" + id;
        if (idStr.length() < 5)
            return "docs/0000/" + id;
        else {
            String[] struc = new String[2];
            struc[0] = idStr.substring(0, 4);
            struc[1] = idStr.substring(4);
            String docName = "docs/" + struc[0] + "/" + struc[1];
            return docName;
        }
    }

    // Fetch the text for one id, clean it, and add or update it in the index
    // (and write it to the optional text output file).
    private void saveDocument(long id, IndexSearcher reader, IndexWriter writer, BufferedWriter fileWriter) throws CorruptIndexException, IOException {
        String path;
        if (rawIds)
            path = "" + id;
        else
            path = sequentialIds ? toSV(this.seqId++) : toSV(id);
        if (reader != null && appendOnly) {
            Term docPathTerm = new Term(FIELD_PATH, path);
            TermQuery tq = new TermQuery(docPathTerm);
            int hits = reader.search(tq, 1).totalHits;
            if (hits > 0) {
                if (debug)
                    System.out.println("Skipping existing doc with id " + id);
                return; // document exists so don't do anything
            }
        }
        String comments = null;
        String nlpComments = null;
        switch (this.extractionMethod) {
        case COMMENTS:
            comments = docStore.getComments(id);
            break;
        case ITEM_ATTR:
            if (attrIds != null && attrIds.length > 0)
                comments = docStore.getItemTextualById(id, new HashSet<Integer>(Arrays.asList(attrIds)));
            else if (attrNames != null && attrNames.length > 0)
                comments = docStore.getItemTextualByName(id, new HashSet<String>(Arrays.asList(attrNames)));
            else
                comments = docStore.getItemTextual(id);
            if (nlpAttrIds != null && nlpAttrIds.length > 0)
                nlpComments = docStore.getItemTextualById(id, new HashSet<Integer>(Arrays.asList(nlpAttrIds)));
            break;
        case USERS:
            comments = docStore.getUserItems(id, useItemIds);
            break;
        case USER_ACTIONS:
            comments = docStore.getUserActionAttrs(id, new HashSet<Integer>(Arrays.asList(attrIds)));
            break;
        case USER_DIM:
            comments = docStore.getDimTextual(id, new HashSet<Integer>(Arrays.asList(textAttrIds)), itemLimit);
            break;
        }
        if (comments != null) {
            if (this.removeHtml) {
                System.out.println("removing html");
                Source source = new Source(comments);
                comments = source.getTextExtractor().toString();
                if (nlpComments != null) {
                    // strip html from the nlp text as well
                    source = new Source(nlpComments);
                    nlpComments = source.getTextExtractor().toString();
                }
            }
            // replace common punctuation with spaces so tokens split cleanly
            comments = comments.replaceAll("\\,|\\.|\\!|\\;|\\/", " ");
            if (transLiterate) {
                System.out.println("removing punctuation");
                comments = TransliteratorPeer.getPunctuationTransLiterator().transliterate(comments);
            }
            if (addEntities != null && nlpComments == null)
                comments = addEntities.process(comments);
            else if (addEntities != null && nlpComments != null) {
                nlpComments = addEntities.process(nlpComments);
                //System.out.println("NLP Comments:["+nlpComments+"]");
                //System.out.println("Existing comments:"+comments);
                comments = comments + " " + nlpComments;
            }
            // when no entity extraction is configured there is nothing more to do:
            // punctuation was already stripped and transliterated above
            comments = comments.replaceAll("\\|", "");
            comments = comments.trim();
            String[] tokens = comments.split(" ");
            if (!"".equals(comments) && tokens.length >= minTokens) {
                if (debug)
                    System.out.println("adding document for id " + id + " with text:[" + comments + "]");
                if (reader != null) {
                    Term docPathTerm = new Term(FIELD_PATH, path);
                    TermQuery tq = new TermQuery(docPathTerm);
                    int hits = reader.search(tq, 1).totalHits;
                    if (hits > 0) // doc exists in index (assumes a unique match...)
                        writer.updateDocument(docPathTerm, createDoc(path, comments));
                    else
                        writer.addDocument(createDoc(path, comments));
                } else
                    writer.addDocument(createDoc(path, comments));
                if (fileWriter != null) {
                    if (yahooLDAfile != null) {
                        // Yahoo LDA format: "<id> <path> <text>"
                        fileWriter.write("" + id);
                        fileWriter.write(" ");
                        fileWriter.write(path);
                        fileWriter.write(" ");
                        fileWriter.write(comments);
                        fileWriter.write("\n");
                    } else {
                        // plain output format: "<id>,<text>"
                        fileWriter.write("" + id);
                        fileWriter.write(",");
                        fileWriter.write(comments);
                        fileWriter.write("\n");
                    }
                }
            } else
                System.out.println("Skipping document with id " + id + " of token length " + tokens.length);
        }
    }

    // Gather the candidate ids for the configured extraction mode and index each one.
    private void updateComments(IndexSearcher reader, IndexWriter writer, int itemType, boolean recreate, BufferedWriter yahooWriter) throws CorruptIndexException, IOException {
        Calendar cal = Calendar.getInstance();
        if (deltaMins > 0)
            cal.add(Calendar.MINUTE, deltaMins * -1);
        ArrayList<Long> ids = null;
        switch (this.extractionMethod) {
        case COMMENTS:
            ids = docStore.getLatestComments(itemType, deltaMins > 0 ? cal.getTime() : null);
            break;
        case ITEM_ATTR:
            ids = docStore.getLatestItems(itemType, deltaMins > 0 ? cal.getTime() : null, itemLimit, clientItemPattern, useItemMapDatetime, filterAttrEnumId);
            break;
        case USERS:
        case USER_ACTIONS:
            ids = docStore.getLatestUsers(deltaMins > 0 ? cal.getTime() : null);
            break;
        case USER_DIM:
            ids = docStore.getUserDim(new HashSet<Integer>(Arrays.asList(attrIds)));
            break;
        }
        if (ids != null) {
            System.out.println("Found " + ids.size() + " new items with comments");
            int count = 0;
            for (Long id : ids) {
                System.out.println("Processing " + (++count) + "/" + ids.size());
                saveDocument(id, reader, writer, yahooWriter);
            }
        }
    }

    public boolean config(String[] args) throws IOException {
        // both are marked required, but guard against missing values anyway
        if (luceneDir == null || jdbc == null) {
            System.out.println("bad args");
            return false;
        }
        // null-safe checks: these Boolean flags may be left null when not supplied
        if (Boolean.TRUE.equals(useItemAttrs))
            this.extractionMethod = EXTRACTION_TYPE.ITEM_ATTR;
        else if (Boolean.TRUE.equals(useComments))
            this.extractionMethod = EXTRACTION_TYPE.COMMENTS;
        else if (Boolean.TRUE.equals(useUsers))
            this.extractionMethod = EXTRACTION_TYPE.USERS;
        else if (Boolean.TRUE.equals(useUserActions))
            this.extractionMethod = EXTRACTION_TYPE.USER_ACTIONS;
        else if (Boolean.TRUE.equals(useDim))
            this.extractionMethod = EXTRACTION_TYPE.USER_DIM;
        if (deltaDays > 0)
            deltaMins = deltaMins + (deltaDays * 24 * 60);
        if (extractionMethod == null) {
            System.out.println("must supply one of -use-item-attrs, -use-comments, -use-users, -use-user-actions or -use-dim");
            return false;
        }
        if (nlpLocation != null) {
            addEntities = new AddEntities(nlpLocation + "/en-sent.bin",
                    nlpLocation + "/en-token.bin",
                    extractPersons ? nlpLocation + "/en-ner-person.bin" : null,
                    extractOrganisations ? nlpLocation + "/en-ner-organization.bin" : null,
                    extractPlaces ? nlpLocation + "/en-ner-location.bin" : null,
                    useStopwords ? (stopWordsFile == null ? nlpLocation + "/stopwords.txt" : stopWordsFile) : null,
                    extractNouns ? nlpLocation + "/en-pos-maxent.bin" : null,
                    conceptsFile);
        } else if (useStopwords || conceptsFile != null) {
            // nlpLocation is null in this branch, so the stop words file must be supplied explicitly
            addEntities = new AddEntities(stopWordsFile, conceptsFile);
        }
        return true;
    }

    public static void main(String[] args) throws IOException {
        CreateLuceneIndexFromDb cr = new CreateLuceneIndexFromDb();
        try {
            Args.parse(cr, args);
            if (cr.config(args)) {
                cr.createIndex();
            }
        } catch (IllegalArgumentException e) {
            e.printStackTrace();
            Args.usage(cr);
        }
    }
}
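To make the command-line surface concrete, here is a minimal sketch of driving the indexer and then spot-checking the result. It is illustrative only: the JDBC URL, index directory and item id are placeholders, a reachable MySQL database with the Seldon schema is assumed, and the verification code uses the same Lucene 3.x/4.x-era API as the class above.

import java.io.File;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

import io.seldon.semvec.CreateLuceneIndexFromDb;

public class CreateLuceneIndexFromDbExample {
    public static void main(String[] args) throws Exception {
        // Index item attributes, rebuilding the index from scratch and skipping
        // documents shorter than 5 tokens. Equivalent shell invocation:
        //   java io.seldon.semvec.CreateLuceneIndexFromDb \
        //     -l /tmp/seldon-index -j "jdbc:mysql://localhost/seldon?user=u&password=p" \
        //     -use-item-attrs -recreate -min-tokens 5
        CreateLuceneIndexFromDb.main(new String[] {
                "-l", "/tmp/seldon-index",                               // placeholder index dir
                "-j", "jdbc:mysql://localhost/seldon?user=u&password=p", // placeholder JDBC URL
                "-use-item-attrs", "-recreate",
                "-min-tokens", "5" });

        // Spot-check: with the default (non raw-ids) settings, item id 123 is
        // stored under the Semantic Vectors style path "docs/0000/123".
        DirectoryReader reader = DirectoryReader.open(FSDirectory.open(new File("/tmp/seldon-index")));
        IndexSearcher searcher = new IndexSearcher(reader);
        TopDocs hits = searcher.search(new TermQuery(new Term("path", "docs/0000/123")), 1);
        if (hits.totalHits > 0) {
            Document doc = searcher.doc(hits.scoreDocs[0].doc);
            System.out.println("Indexed: " + doc.get("path"));
        } else {
            System.out.println("Item 123 was not indexed");
        }
        reader.close();
    }
}

The lookup has to go through the stored path field: the contents field is searchable but created with Field.Store.NO, so its text cannot be read back directly.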