it.cnr.isti.hpc.dexter.analysis.SpotCleaner.java Source code

Introduction

Here is the source code for it.cnr.isti.hpc.dexter.analysis.SpotCleaner.java
Source

/**
 *  Copyright 2014 Diego Ceccarelli
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package it.cnr.isti.hpc.dexter.analysis;

import it.cnr.isti.hpc.dexter.spot.cleanpipe.filter.Filter;
import it.cnr.isti.hpc.dexter.spot.cleanpipe.filter.ImageFilter;
import it.cnr.isti.hpc.dexter.spot.cleanpipe.filter.NumberFilter;
import it.cnr.isti.hpc.dexter.spot.cleanpipe.filter.SymbolFilter;
import it.cnr.isti.hpc.dexter.spot.cleanpipe.filter.TemplateFilter;
import it.cnr.isti.hpc.dexter.spot.cleanpipe.mapper.CityMapper;
import it.cnr.isti.hpc.dexter.spot.cleanpipe.mapper.Mapper;
import it.cnr.isti.hpc.dexter.spot.cleanpipe.mapper.QuotesMapper;
import it.cnr.isti.hpc.wikipedia.article.Article;
import it.cnr.isti.hpc.wikipedia.article.Link;

import java.io.IOException;
import java.io.StringReader;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * Normalizes candidate spots and collects the spots of a Wikipedia
 * {@link Article}.
 * <p>
 * A spot is URL-decoded, tokenized with a {@link SpotAnalyzer}, re-joined with
 * single spaces and finally checked against a list of {@link Filter}s. A spot
 * with too many tokens, or one matched by a filter, is cleaned to the empty
 * string.
 * <p>
 * Instances are not thread-safe: {@link #clean(String)} reuses a shared
 * {@link StringBuilder} and reconfigures the shared analyzer on every call.
 *
 * @author Diego Ceccarelli <diego.ceccarelli@isti.cnr.it>
 * 
 *         Created on Sep 21, 2014
 */
public class SpotCleaner {
    /** Scratch buffer reused by {@link #clean(String)} to assemble the tokens. */
    StringBuilder sb;
    /**
     * Spot mappers. Populated by the constructor, but currently not applied by
     * any method of this class.
     */
    List<Mapper<String>> mappers;
    /** Filters consulted by {@link #clean(String)}; any match discards the spot. */
    List<Filter<String>> filters;
    /** Analyzer used to tokenize spots. */
    SpotAnalyzer analyzer = new SpotAnalyzer();

    /** Maximum number of tokens a spot may have; longer spots are discarded. */
    int maxSpotLength = 6;

    /**
     * Creates a cleaner with the default mappers (city, quotes) and the
     * default filters (number, symbol, template, image).
     */
    public SpotCleaner() {
        sb = new StringBuilder();
        mappers = new ArrayList<Mapper<String>>();
        mappers.add(new CityMapper());
        mappers.add(new QuotesMapper());

        filters = new ArrayList<Filter<String>>();
        filters.add(new NumberFilter());
        filters.add(new SymbolFilter());
        filters.add(new TemplateFilter());
        filters.add(new ImageFilter());
    }

    /**
     * Cleans a single spot.
     *
     * @param spot
     *            the raw spot, possibly URL-encoded; {@code null} is treated
     *            as an unusable spot
     * @return the analyzed tokens joined by single spaces, or the empty
     *         string if the spot is {@code null}, has more than
     *         {@link #maxSpotLength} tokens, or is matched by one of the
     *         filters
     * @throws IOException
     *             if decoding or tokenizing the spot fails
     */
    public String clean(String spot) throws IOException {
        if (spot == null) {
            return "";
        }
        try {
            spot = URLDecoder.decode(spot, "UTF-8");
        } catch (IllegalArgumentException ignored) {
            // Malformed percent-escape: best effort, keep the spot undecoded.
        }

        // NOTE(review): presumably enables lowercasing only for spots longer
        // than 4 characters - confirm against SpotAnalyzer.
        analyzer.lowercase(spot.length() > 4);

        sb.setLength(0);
        TokenStream ts = analyzer.tokenStream("content", new StringReader(spot));
        try {
            CharTermAttribute termAtt = ts
                    .addAttribute(CharTermAttribute.class);
            ts.reset();
            int tokens = 0;
            while (ts.incrementToken()) {
                tokens++;
                if (tokens > maxSpotLength) {
                    // Too many tokens: discard. The finally block still
                    // closes the stream so the analyzer can be reused.
                    return "";
                }
                sb.append(termAtt.toString()).append(' ');
            }
            ts.end();
        } finally {
            // The TokenStream workflow requires close() after consumption,
            // on the early-exit path as well.
            ts.close();
        }

        if (sb.length() > 0) {
            // Drop the trailing separator.
            sb.setLength(sb.length() - 1);
        }
        String finalSpot = sb.toString();
        for (Filter<String> filter : filters) {
            if (filter.isFilter(finalSpot)) {
                return "";
            }
        }
        return finalSpot;
    }

    /**
     * Cleans the given spot and, if the result is not empty, adds it to the
     * given set. The mappers are currently not applied here.
     *
     * @param spot
     *            the raw spot
     * @param enriched
     *            the set receiving the cleaned spot
     * @throws IOException
     *             if cleaning the spot fails
     */
    public void enrich(String spot, Set<String> enriched) throws IOException {
        String cleanSpot = clean(spot);
        if (!cleanSpot.isEmpty()) {
            enriched.add(cleanSpot);
        }
    }

    /**
     * Collects the cleaned spots of an article into a new set.
     *
     * @param a
     *            the article
     * @return a new set with the cleaned spots of the article
     * @throws IOException
     *             if cleaning a spot fails
     * @see #getAllSpots(Article, Set)
     */
    public Set<String> getAllSpots(Article a) throws IOException {
        Set<String> spots = new HashSet<String>();
        getAllSpots(a, spots);
        return spots;
    }

    /**
     * Cleans a single spot and returns the result as a set.
     *
     * @param string
     *            the raw spot
     * @return a new set containing the cleaned spot, or an empty set if the
     *         spot was discarded
     * @throws IOException
     *             if cleaning the spot fails
     */
    public Set<String> enrich(String string) throws IOException {
        Set<String> spots = new HashSet<String>();
        enrich(string, spots);
        return spots;
    }

    /**
     * Adds the cleaned spots of an article to the given set: always the
     * title, plus the redirect target (without anchor) if the article is a
     * redirect, otherwise the description of each of its links.
     *
     * @param a
     *            the article
     * @param spots
     *            the set receiving the cleaned spots
     * @throws IOException
     *             if cleaning a spot fails
     */
    public void getAllSpots(Article a, Set<String> spots) throws IOException {
        enrich(a.getTitle(), spots);
        if (a.isRedirect()) {
            enrich(a.getRedirectNoAnchor(), spots);
        } else {
            for (Link l : a.getLinks()) {
                enrich(l.getDescription(), spots);
            }
        }
    }
}