org.apache.nutch.indexer.TestIndexerMapReduce.java Source code


Introduction

Here is the source code for org.apache.nutch.indexer.TestIndexerMapReduce.java. The class is a JUnit test for Nutch's IndexerMapReduce job: it uses Apache MRUnit's ReduceDriver to run the IndexerReducer in isolation and verifies that raw page content is indexed as a base64-encoded binaryContent field for UTF-8, ISO-8859-1, and ISO-8859-2 documents.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.indexer;

import java.lang.invoke.MethodHandles;

import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.apache.hadoop.mrunit.types.Pair;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import static org.junit.Assert.*;

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;

/** Test {@link IndexerMapReduce} */
public class TestIndexerMapReduce {

    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

    public static String testUrl = "http://nutch.apache.org/";
    public static Text testUrlText = new Text(testUrl);
    public static String htmlContentType = "text/html";
    public static String testHtmlDoc = "<!DOCTYPE html>\n" + "<html>\n" + "<head>\n"
            + "<title>Test Indexing Binary Content</title>\n" + "<meta charset=\"utf-8\">\n"
            + "<meta name=\"keywords\" lang=\"en\" content=\"charset, encoding\" />\n"
            + "<meta name=\"keywords\" lang=\"fr\" content=\"codage des caractres\" />\n"
            + "<meta name=\"keywords\" lang=\"cs\" content=\"kdovn znak\" />\n" + "</head>\n" + "<body>\n"
            + "<p>\n" + "<ul>\n" + "  <li lang=\"en\">English: character set, encoding\n"
            + "  <li lang=\"fr\">Franais: codage des caractres\n"
            + "  <li lang=\"cs\">etina: kdovn znak (not covered by Latin-1)\n" + "</ul>\n" + "</body>\n"
            + "</html>";
    public static Metadata htmlMeta = new Metadata();
    static {
        htmlMeta.add("Content-Type", "text/html");
        // add segment and signature to avoid NPEs
        htmlMeta.add(Nutch.SEGMENT_NAME_KEY, "123");
        htmlMeta.add(Nutch.SIGNATURE_KEY, "123");
    }
    public static ParseText parseText = new ParseText("Test");
    public static ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "Test", new Outlink[] {},
            htmlMeta);
    public static CrawlDatum crawlDatumDbFetched = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 60 * 60 * 24);
    public static CrawlDatum crawlDatumFetchSuccess = new CrawlDatum(CrawlDatum.STATUS_FETCH_SUCCESS, 60 * 60 * 24);

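    // MRUnit's ReduceDriver runs the IndexerReducer in isolation,
    // without launching a live Hadoop job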
    private Reducer<Text, NutchWritable, Text, NutchIndexAction> reducer = new IndexerMapReduce.IndexerReducer();
    private ReduceDriver<Text, NutchWritable, Text, NutchIndexAction> reduceDriver;
    private Configuration configuration;

    /**
     * Test indexing of base64-encoded binary content.
     */
    @Test
    public void testBinaryContentBase64() {
        configuration = NutchConfiguration.create();
        configuration.setBoolean(IndexerMapReduce.INDEXER_BINARY_AS_BASE64, true);

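        // UTF-8 plus two single-byte charsets with different coverage of the
        // accented characters in the test document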
        Charset[] testCharsets = { StandardCharsets.UTF_8, Charset.forName("iso-8859-1"),
                Charset.forName("iso-8859-2") };
        for (Charset charset : testCharsets) {
            LOG.info("Testing indexing binary content as base64 for charset {}", charset.name());

            String htmlDoc = testHtmlDoc;
            if (charset != StandardCharsets.UTF_8) {
                htmlDoc = htmlDoc.replaceAll("utf-8", charset.name());
                if (charset.name().equalsIgnoreCase("iso-8859-1")) {
                    // Western-European character set: remove Czech content
                    htmlDoc = htmlDoc.replaceAll("\\s*<[^>]+\\slang=\"cs\".+?\\n", "");
                } else if (charset.name().equalsIgnoreCase("iso-8859-2")) {
                    // Eastern-European character set: remove French content
                    htmlDoc = htmlDoc.replaceAll("\\s*<[^>]+\\slang=\"fr\".+?\\n", "");
                }
            }

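            // encode the document with the charset under test to simulate
            // raw fetched content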
            Content content = new Content(testUrl, testUrl, htmlDoc.getBytes(charset), htmlContentType, htmlMeta,
                    configuration);

            NutchDocument doc = runIndexer(crawlDatumDbFetched, crawlDatumFetchSuccess, parseText, parseData,
                    content);
            assertNotNull("No NutchDocument indexed", doc);

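            // decoding the base64 field with the original charset must
            // restore the original document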
            String binaryContentBase64 = (String) doc.getField("binaryContent").getValues().get(0);
            LOG.info("binary content (base64): {}", binaryContentBase64);
            String binaryContent = new String(Base64.decodeBase64(binaryContentBase64), charset);
            LOG.info("binary content (decoded): {}", binaryContent);
            assertEquals("Binary content (" + charset + ") not correctly saved as base64", htmlDoc, binaryContent);
        }
    }

    /**
     * Run the {@link IndexerMapReduce.IndexerReducer} to get an
     * &quot;indexed&quot; {@link NutchDocument} by passing objects from
     * segment and CrawlDb to the indexer.
     *
     * @param dbDatum
     *          crawl datum from CrawlDb
     * @param fetchDatum
     *          crawl datum (fetch status) from segment
     * @param parseText
     *          plain text from parsed document
     * @param parseData
     *          parse data
     * @param content
     *          protocol content (optional, required only when binary content
     *          is indexed)
     * @return &quot;indexed&quot; document
     */
    public NutchDocument runIndexer(CrawlDatum dbDatum, CrawlDatum fetchDatum, ParseText parseText,
            ParseData parseData, Content content) {
        List<NutchWritable> values = new ArrayList<>();
        values.add(new NutchWritable(dbDatum));
        values.add(new NutchWritable(fetchDatum));
        values.add(new NutchWritable(parseText));
        values.add(new NutchWritable(parseData));
        values.add(new NutchWritable(content));
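        // create a fresh ReduceDriver per call and merge the test
        // configuration into the driver's Hadoop configuration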
        reduceDriver = ReduceDriver.newReduceDriver(reducer);
        reduceDriver.getConfiguration().addResource(configuration);
        reduceDriver.withInput(testUrlText, values);
        List<Pair<Text, NutchIndexAction>> reduceResult;
        NutchDocument doc = null;
        try {
            reduceResult = reduceDriver.run();
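            // keep the document from the last non-delete action emitted by
            // the reducer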
            for (Pair<Text, NutchIndexAction> p : reduceResult) {
                if (p.getSecond().action != NutchIndexAction.DELETE) {
                    doc = p.getSecond().doc;
                }
            }
        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
        }
        return doc;
    }

}
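
As a side note, the base64 round trip asserted by testBinaryContentBase64 can be reproduced with Commons Codec alone. The sketch below is illustrative only (the class Base64RoundTrip is not part of Nutch); it assumes commons-codec on the classpath and demonstrates the encode/decode symmetry the test relies on.

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import org.apache.commons.codec.binary.Base64;

// Hypothetical demo class, not part of Nutch: encoding a document's bytes
// as base64 and decoding them with the same charset restores the original
// string, which is what the test asserts for the binaryContent field.
public class Base64RoundTrip {

    public static void main(String[] args) {
        String doc = "<html><body>Français: codage des caractères</body></html>";
        for (Charset charset : new Charset[] { StandardCharsets.UTF_8,
                StandardCharsets.ISO_8859_1 }) {
            // encode the raw bytes, as the indexer does for binaryContent
            String base64 = Base64.encodeBase64String(doc.getBytes(charset));
            // decode with the same charset to recover the document
            String decoded = new String(Base64.decodeBase64(base64), charset);
            if (!doc.equals(decoded)) {
                throw new AssertionError("round trip failed for " + charset);
            }
            System.out.println(charset + ": OK");
        }
    }
}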