it.cnr.isti.hpc.dexter.spot.DocumentFrequencyGenerator.java Source code

Java tutorial

Introduction

Here is the source code for it.cnr.isti.hpc.dexter.spot.DocumentFrequencyGenerator.java

Source

/**
 *  Copyright 2014 Diego Ceccarelli
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package it.cnr.isti.hpc.dexter.spot;

import it.cnr.isti.hpc.benchmark.Stopwatch;
import it.cnr.isti.hpc.dexter.analysis.DexterAnalyzer;
import it.cnr.isti.hpc.dexter.analysis.DexterAnalyzer.ArticleIterator;
import it.cnr.isti.hpc.io.Serializer;
import it.cnr.isti.hpc.log.ProgressLogger;
import it.cnr.isti.hpc.wikipedia.article.Article;
import it.unimi.dsi.util.BloomFilter;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.Iterator;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;

/**
 * Counts, for a given article, the terms that belong to a known set of spots.
 * <p>
 * The set of spots is approximated with a bloom filter: for every distinct
 * consecutive spot string the first token produced by {@link DexterAnalyzer}
 * (with shingles disabled) is added to the filter. The filter is cached on
 * disk so that later instances can load it instead of rebuilding it.
 * <p>
 * Instances keep mutable state (the article iterator and the result multiset)
 * that is reused across calls, so a single instance must not be shared between
 * threads without external synchronization.
 *
 * @author Diego Ceccarelli <diego.ceccarelli@isti.cnr.it>
 * 
 *         Created on Aug 8, 2014
 */
public class DocumentFrequencyGenerator {

    /**
     * Logger for this class
     */
    private static final Logger logger = LoggerFactory.getLogger(DocumentFrequencyGenerator.class);

    /** Location of the bloom filter cache used by the one-argument constructor. */
    private static final String DEFAULT_BLOOM_FILE = "/tmp/bf.bin";

    BloomFilter<Void> bf = BloomFilter.create(10000000L);
    DexterAnalyzer analyzer = new DexterAnalyzer();

    Stopwatch watch = new Stopwatch();
    ArticleIterator iterator = new DexterAnalyzer.ArticleIterator();
    Multiset<String> freqs = HashMultiset.create();

    /**
     * Creates a generator whose bloom filter is cached in the default
     * location ({@code /tmp/bf.bin}).
     *
     * @param spotSrcTargetIterator
     *            source of spot strings; only consumed when no cached bloom
     *            filter exists
     */
    public DocumentFrequencyGenerator(Iterator<String> spotSrcTargetIterator) {
        this(spotSrcTargetIterator, new File(DEFAULT_BLOOM_FILE));
    }

    /**
     * Creates a generator whose bloom filter is cached in the given file. If
     * the file exists the filter is loaded from it; otherwise the filter is
     * built from the iterator and then dumped to the file.
     *
     * @param spotSrcTargetIterator
     *            source of spot strings; only consumed when {@code bloom} does
     *            not exist
     * @param bloom
     *            file holding (or about to hold) the serialized bloom filter
     */
    public DocumentFrequencyGenerator(Iterator<String> spotSrcTargetIterator, File bloom) {
        Serializer serializer = new Serializer();
        String path = bloom.getAbsolutePath();
        if (bloom.exists()) {
            logger.info("load bloom filter in {}", path);
            // The file is only ever written by the dump below, so it is
            // expected to contain a BloomFilter<Void>.
            @SuppressWarnings("unchecked")
            BloomFilter<Void> loaded = (BloomFilter<Void>) serializer.load(path);
            bf = loaded;
        } else {
            initBloomFilter(spotSrcTargetIterator);
            logger.info("dump bloom filter in {}", path);
            serializer.dump(bf, path);
        }
    }

    /**
     * Fills the bloom filter from the given spots. Consecutive identical
     * strings are processed only once. An empty iterator leaves the filter
     * empty.
     *
     * @param spotIterator
     *            source of spot strings
     */
    private void initBloomFilter(Iterator<String> spotIterator) {
        analyzer.setShingles(false);

        ProgressLogger pl = new ProgressLogger("added {} spots to the bloom filter", 100000);
        // Remember the previous *raw* string: comparing against the analyzed
        // token would defeat the consecutive-duplicate check.
        String previous = null;
        while (spotIterator.hasNext()) {
            String current = spotIterator.next();
            if (current.equals(previous)) {
                continue;
            }
            previous = current;
            pl.up();
            addFirstTokenToBloomFilter(current);
        }
    }

    /**
     * Analyzes the spot and adds its first token, if any, to the bloom
     * filter. Analysis failures are logged and the spot is skipped.
     *
     * @param spot
     *            raw spot string to analyze
     */
    private void addFirstTokenToBloomFilter(String spot) {
        TokenStream ts = null;
        try {
            ts = analyzer.tokenStream("content", new StringReader(spot));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            if (ts.incrementToken()) {
                bf.add(termAtt.toString());
            }
        } catch (IOException e) {
            logger.error("cannot analyze spot '{}', skipping it", spot, e);
        } finally {
            // The stream must be closed before the analyzer hands out the
            // next one.
            if (ts != null) {
                try {
                    ts.close();
                } catch (IOException e) {
                    logger.warn("cannot close token stream for spot '{}'", spot, e);
                }
            }
        }
    }

    /**
     * Returns the terms of the article that the bloom filter reports as
     * known, together with their number of occurrences in the article.
     * <p>
     * The returned multiset is owned by this instance and is cleared and
     * refilled on every call; copy it if it must outlive the next call. Since
     * membership is decided by a bloom filter, the result may contain terms
     * that were never added as spots.
     *
     * @param a
     *            the article to scan
     * @return the multiset of matching terms; empty if the article cannot be
     *         analyzed
     */
    public Multiset<String> getSpotsAndFrequencies(Article a) {
        freqs.clear();
        try {
            iterator.setArticle(a);
        } catch (IOException e) {
            // Do not fall through: the iterator may still hold the state of a
            // previously processed article.
            logger.error("cannot analyze article, returning no spots", e);
            return freqs;
        }
        while (iterator.hasNext()) {
            String key = iterator.next();
            if (bf.contains(key)) {
                freqs.add(key);
            }
        }
        return freqs;
    }

}