babel.prep.langid.LangIdMapper.java Source code

Java tutorial

Introduction

Here is the source code for babel.prep.langid.LangIdMapper.java

Source

/**
 * This file is licensed to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package babel.prep.langid;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

import babel.content.pages.Page;
import babel.content.pages.PageVersion;

import babel.util.language.GoogleLangDetector;
import babel.util.language.LangDetectionResult;
import babel.util.language.LangDetector;
import babel.util.language.Language;

/**
 * Mapper that tries to assign a language to every page that does not already
 * have one, updates the {@link LangIdentifier.Stats} counters accordingly and
 * re-emits each page under its original key.
 */
public class LangIdMapper extends MapReduceBase implements Mapper<Text, Page, Text, Page> {
    static final Log LOG = LogFactory.getLog(LangIdMapper.class);

    /**
     * Sets up the language detector.
     *
     * @param job job configuration; the value stored under
     *            {@link LangIdentifier#JOB_PROP_JOB_REFERRER} is handed to the
     *            detector as its referrer.
     */
    public void configure(JobConf job) {
        String referrer = job.get(LangIdentifier.JOB_PROP_JOB_REFERRER);

        // Alternative detector: m_detector = new NutchLangDetector(job);
        m_detector = new GoogleLangDetector(referrer);
    }

    /**
     * Detects the language of pages that have none, records the outcome in the
     * job statistics and always forwards the (possibly updated) page.
     *
     * @param url      key of the page.
     * @param page     page to examine; its language may be set as a side effect.
     * @param output   collector receiving the unchanged key and the page.
     * @param reporter progress reporter (unused here).
     * @throws IOException if the collector fails to accept the record.
     */
    @Override
    public void map(Text url, Page page, OutputCollector<Text, Page> output, Reporter reporter) throws IOException {
        if (page.getLanguage() == null) {
            String lang = detectLang(page);
            if (lang != null) {
                LangIdentifier.Stats.incLangPageCount(lang);
            } else {
                LangIdentifier.Stats.incFailedCount();
            }
        } else {
            // Page already carried a language; count it separately.
            LangIdentifier.Stats.incOldLangPageCount();
        }

        output.collect(url, page);
    }

    /**
     * Runs the detector over the first page version that has non-empty content
     * and, when the result is non-null and reliable, stores it on the page.
     *
     * <p>Versions whose content is {@code null} or empty are skipped. Any
     * exception raised during detection is logged and treated as a failed
     * detection rather than propagated.
     *
     * @param page page whose versions are inspected; its language is set on success.
     * @return string form of the detected language, or {@code null} when no
     *         reliable language could be determined.
     */
    public String detectLang(Page page) {
        Language lang = null;

        // TODO: May help to be more sophisticated, but for now - grab content of
        // the first available version and run it through the language identifier
        try {
            for (PageVersion ver : page.pageVersions()) {
                String content = ver.getContent();

                // A version without content must not abort the scan; try the next one.
                if (content == null || content.length() == 0) {
                    continue;
                }

                LangDetectionResult langResult = m_detector.detect(content);
                Language detected = (langResult == null) ? null : langResult.language();

                if (LOG.isInfoEnabled()) {
                    // String concatenation is null-safe, unlike an explicit toString() call.
                    LOG.info("Language " + detected + " for page " + page.pageURL());
                }

                if (detected != null && langResult.isReliable()) {
                    lang = detected;
                    page.setLanguage(lang);
                }

                // Only the first version with content is considered.
                break;
            }
        } catch (Exception e) {
            if (LangIdentifier.LOG.isErrorEnabled()) {
                // Pass the exception along so the stack trace is not lost.
                LangIdentifier.LOG
                        .error("Failed to detect language for page " + page.pageURL() + ": " + e.toString(), e);
            }
        }

        return lang == null ? null : lang.toString();
    }

    /** Detector created in {@link #configure(JobConf)}. */
    private LangDetector m_detector;
}