com.jamespot.glifpix.index.ResourceDocument.java Source code

Introduction

Here is the source code for com.jamespot.glifpix.index.ResourceDocument.java. The class wraps a Lucene Document that stores a resource's literal text, a normalized token derived from the analyzed literal, and a CRC of that token; it also provides static helpers to load all CRCs from an index, resolve a token back to its stored literal, and delete a resource by token.

Source

package com.jamespot.glifpix.index;

/* ----------------------------------------------------------------------------------
    
 This file is part of GlifPix Tags Extractor.
    
 GlifPix Tags Extractor is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
    
 GlifPix Tags Extractor is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
    
 You should have received a copy of the GNU General Public License
 along with GlifPix Tags Extractor.  If not, see <http://www.gnu.org/licenses/>.
    
 Contact : paul<at>jamespot<dot>com
    
 ---------------------------------------------------------------------------------- */

import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;

import com.jamespot.glifpix.library.ContentAnalyzer;
import com.jamespot.glifpix.resources.ResourcesHandler;
import com.jamespot.glifpix.util.Utils;

public class ResourceDocument extends StoredDocument {

    public static final Logger logger = Logger.getLogger(ResourceDocument.class);

    // Number of analyzed terms in the literal, set by addLiteralField().
    private int _resourceLength;

    private ResourceDocument() {
        super();
    }

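    /**
     * Builds a ResourceDocument for the given resource name and literal.
     * Only the literal currently contributes fields; see addNameField below.
     */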
    public static ResourceDocument create(String name, String literal) throws IOException {
        ResourceDocument toRet = new ResourceDocument();
        toRet.addNameField(name);
        toRet.addLiteralField(literal);
        return toRet;
    }

    private void addNameField(String name) {
        // The "name" field is currently not added to the document;
        // uncomment the lines below to store it again.
        // _luceneDocument.add(new Field("name", name, Store.YES,
        // Index.NOT_ANALYZED_NO_NORMS));
    }

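    /**
     * Stores the Unicode-normalized literal verbatim, then derives a "token"
     * from the analyzed literal (the term count followed by "_term" for each
     * term, e.g. "2_hello_world") and a CRC of that token, so a resource can
     * later be matched by exact token or by CRC.
     */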
    private void addLiteralField(String literal) throws IOException {
        _luceneDocument
                .add(new Field("literal", replaceUnicodeStr(literal), Store.YES, Index.NOT_ANALYZED_NO_NORMS));

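        // Remove double quotes, then apply the same Unicode normalization before analysis.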
        String coolLiteral = literal.replaceAll("\\\"", "");
        coolLiteral = replaceUnicodeStr(coolLiteral);

        Analyzer resAnalyzer = new ContentAnalyzer();
        TokenStream ts = resAnalyzer.tokenStream("dummyField", new StringReader(coolLiteral));

        TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);
        ts.reset();

        // Token format: "<termCount>_term1_term2...".
        int length = 0;
        StringBuilder sb = new StringBuilder();
        while (ts.incrementToken()) {
            sb.append("_").append(termAttribute.term());
            length++;
        }
        sb.insert(0, length);
        _resourceLength = length;
        ts.end();
        ts.close();

        String finalToken = sb.toString();
        _luceneDocument.add(new Field("token", finalToken, Store.YES, Index.NOT_ANALYZED_NO_NORMS));
        _luceneDocument.add(new Field("crc", Utils.getCRC(finalToken), Store.YES, Index.NOT_ANALYZED_NO_NORMS));
    }

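    /**
     * Loads every term of the "crc" field into a set. The HashSet is
     * pre-sized from numDocs and the load factor so it does not rehash
     * while loading.
     */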
    protected static Set<Long> getCRCs(IndexReader ir) throws IOException {
        logger.info("Loading CRCs");
        int nbDocs = ir.numDocs();
        float loadFactor = .75f;
        int initCapacity = ((int) (nbDocs / loadFactor)) + 1000;

        Set<Long> toRet = new HashSet<Long>(initCapacity, loadFactor);
        int nbCrc = 0;
        TermEnum te = ir.terms(new Term("crc"));
        try {
            do {
                Term t = te.term();
                // Stop once the enum is exhausted or leaves the "crc" field.
                if (t == null || !t.field().equalsIgnoreCase("crc"))
                    break;
                toRet.add(Long.parseLong(t.text()));
                nbCrc++;
                if (nbCrc % 100000 == 0) {
                    logger.info(nbCrc + " CRCs loaded");
                }
            } while (te.next());
        } finally {
            te.close();
        }
        logger.info("Total CRCs loaded: " + nbCrc);
        return toRet;
    }

    public int getResourceLength() {
        return _resourceLength;
    }

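    /**
     * Returns the stored literal for an exact token, or null if the token
     * does not exist in the index.
     */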
    public static String getLiteral(IndexReader ir, String token) throws IOException {
        TermEnum te = ir.terms(new Term("token", token));
        try {
            Term t = te.term();
            if (t != null && t.field().equalsIgnoreCase("token")) {
                TermDocs td = ir.termDocs(t);
                try {
                    if (td.next()) {
                        Document doc = ir.document(td.doc());
                        if (token.equals(doc.get("token"))) {
                            return doc.get("literal");
                        }
                    }
                } finally {
                    td.close();
                }
            }
        } finally {
            te.close();
        }
        return null;
    }

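    /**
     * Deletes every document whose "token" field equals the given token.
     * The IndexReader must have been opened in read-write mode.
     */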
    public static void deleteResource(IndexReader ir, String token) throws IOException {
        ir.deleteDocuments(new Term("token", token));
    }
}
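
Usage example

The snippet below is a minimal, hypothetical sketch of how this class could be driven with the Lucene 3.x API. It assumes that the StoredDocument base class (not shown here) exposes its underlying Lucene Document through an accessor, called getLuceneDocument() below; that accessor name, the ResourceDocumentDemo class, and the sample strings are illustrative, not part of the original code.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

import com.jamespot.glifpix.index.ResourceDocument;

public class ResourceDocumentDemo {

    public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30),
                true, IndexWriter.MaxFieldLength.UNLIMITED);

        // Build a resource document; the "token" and "crc" fields are derived from the literal.
        ResourceDocument rd = ResourceDocument.create("greeting", "Hello \"World\"");
        Document doc = rd.getLuceneDocument(); // hypothetical accessor on StoredDocument
        String token = doc.get("token"); // e.g. "2_hello_world", depending on ContentAnalyzer

        writer.addDocument(doc);
        writer.close();

        // Look the literal back up by its exact token.
        IndexReader reader = IndexReader.open(dir, true);
        System.out.println("literal = " + ResourceDocument.getLiteral(reader, token));
        reader.close();
    }
}

Since the fields are stored with Store.YES and indexed as NOT_ANALYZED_NO_NORMS, the analyzer passed to IndexWriter never touches them; ContentAnalyzer only runs inside addLiteralField.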