Java tutorial: testing a Cascading Buffer with a real GroupBy (LatestUrlDatumBufferTest)
/*
 * Copyright 2009-2012 Scale Unlimited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package bixo.examples.crawl;

import static org.junit.Assert.assertEquals;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.junit.Before;
import org.junit.Test;

import bixo.datum.UrlDatum;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.scheme.SequenceFile;
import cascading.tap.Hfs;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.TupleEntry;
import cascading.tuple.TupleEntryCollector;
import cascading.tuple.TupleEntryIterator;

import com.bixolabs.cascading.HadoopUtils;

@SuppressWarnings("deprecation")
public class LatestUrlDatumBufferTest {

    private static final String WORKINGDIR = "build/test/LatestUrlDatumBufferTest";

    private static final Path _workingDirPath = new Path(WORKINGDIR);

    private JobConf _conf = new JobConf();

    @Before
    public void setUp() throws IOException {
        File workingFolder = new File(WORKINGDIR);
        if (workingFolder.exists()) {
            FileUtils.deleteDirectory(workingFolder);
        }
    }

    /* Can't use the test below since it doesn't simulate the reusing of tuples in a Cascading
     * GroupBy operation.
     * In particular it will fail to catch a case where an assignment of the type
     *      aDatum = datum
     * is being incorrectly done.
     * Instead we want it to be
     *      aDatum = new DatumType(datum)
     */
/*
    @Test
    public void testOperate() throws BaseFetchException, IOException {
        LatestUrlDatumBuffer op = new LatestUrlDatumBuffer();

        HadoopFlowProcess fp = Mockito.mock(HadoopFlowProcess.class);
        Mockito.when(fp.getJobConf()).thenReturn(new JobConf());

        OperationCall<NullContext> oc = Mockito.mock(OperationCall.class);
        BufferCall<NullContext> bc = Mockito.mock(BufferCall.class);
        TupleEntryCollector collector = Mockito.mock(TupleEntryCollector.class);

        List<TupleEntry> tupleEntryList = new ArrayList<TupleEntry>();

        UrlDatum urlDatum1 = new UrlDatum("http://foo.com");
        urlDatum1.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 0L);
        urlDatum1.setPayloadValue(CrawlDbDatum.LAST_STATUS_FIELD, UrlStatus.UNFETCHED);
        TupleEntry entry1 = new TupleEntry(UrlDatum.FIELDS);
        entry1.setTuple(urlDatum1.getTuple());
        tupleEntryList.add(entry1);

        UrlDatum urlDatum2 = new UrlDatum("http://foo.com");
        urlDatum2.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 2L);
        urlDatum2.setPayloadValue(CrawlDbDatum.LAST_STATUS_FIELD, UrlStatus.FETCHED);
        TupleEntry entry2 = new TupleEntry(UrlDatum.FIELDS);
        entry2.setTuple(urlDatum2.getTuple());
        tupleEntryList.add(entry2);

        UrlDatum urlDatum3 = new UrlDatum("http://foo.com");
        urlDatum3.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 0L);
        urlDatum3.setPayloadValue(CrawlDbDatum.LAST_STATUS_FIELD, UrlStatus.UNFETCHED);
        TupleEntry entry3 = new TupleEntry(UrlDatum.FIELDS);
        entry3.setTuple(urlDatum3.getTuple());
        tupleEntryList.add(entry3);

        Mockito.when(bc.getArgumentsIterator()).thenReturn(tupleEntryList.iterator());
        Mockito.when(bc.getOutputCollector()).thenReturn(collector);

        op.prepare(fp, oc);
        op.operate(fp, bc);
        op.cleanup(fp, oc);

        Mockito.verify(collector, Mockito.times(1)).add(Mockito.argThat(new MatchUrlDatum()));
        Mockito.verifyNoMoreInteractions(collector);
    }

    private static class MatchUrlDatum extends ArgumentMatcher<Tuple> {
        @Override
        public boolean matches(Object argument) {
            TupleEntry entry = new TupleEntry(UrlDatum.FIELDS);
            entry.setTuple((Tuple) argument);
            UrlDatum datum = new UrlDatum(entry);
            Long expectedVal = new Long(2);
            Long result = (Long) datum.getPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD);
            if (result.longValue() == expectedVal.longValue()) {
                return true;
            }
            return false;
        }
    }
*/

    @Test
    public void testOperateWithGroupBy() throws IOException {
        // Create a temp file with a fetched url
        Path fetchedDatumsPath = new Path(_workingDirPath, "fetched");
        ArrayList<UrlDatum> fetchedDatums = new ArrayList<UrlDatum>();
        UrlDatum fetchedDatum1 = new UrlDatum("http://foo.com");
        fetchedDatum1.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 2L);
        fetchedDatums.add(fetchedDatum1);
        createDataFile(fetchedDatumsPath.toString(), fetchedDatums);

        // And another with unfetched urls
        Path unfetchedDatumsPath = new Path(_workingDirPath, "unfetched");
        ArrayList<UrlDatum> unfetchedDatums = new ArrayList<UrlDatum>();
        UrlDatum unfetchedDatum1 = new UrlDatum("http://foo.com");
        unfetchedDatum1.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 0L);
        unfetchedDatums.add(unfetchedDatum1);
        UrlDatum unfetchedDatum2 = new UrlDatum("http://foo.com");
        unfetchedDatum2.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 0L);
        unfetchedDatums.add(unfetchedDatum2);
        createDataFile(unfetchedDatumsPath.toString(), unfetchedDatums);

        // create a workflow
        Tap inputSource1 = new Hfs(new SequenceFile(UrlDatum.FIELDS), fetchedDatumsPath.toString());
        Pipe fetchedPipe = new Pipe("fetched");
        Tap inputSource2 = new Hfs(new SequenceFile(UrlDatum.FIELDS), unfetchedDatumsPath.toString());
        Pipe unfetchedPipe = new Pipe("unfetched");

        Map<String, Tap> sources = new HashMap<String, Tap>();
        sources.put(fetchedPipe.getName(), inputSource1);
        sources.put(unfetchedPipe.getName(), inputSource2);

        Path resultsPath = new Path(_workingDirPath, "results");
        Tap resultSink = new Hfs(new SequenceFile(UrlDatum.FIELDS), resultsPath.toString(), true);

        Pipe resultsPipe = new GroupBy("results pipe", Pipe.pipes(fetchedPipe, unfetchedPipe),
                        new Fields(UrlDatum.URL_FN));
        resultsPipe = new Every(resultsPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

        Properties props = HadoopUtils.getDefaultProperties(LatestUrlDatumBufferTest.class, false, _conf);
        FlowConnector flowConnector = new FlowConnector(props);
        Flow flow = flowConnector.connect(sources, resultSink, resultsPipe);
        flow.complete();

        // verify that the resulting pipe has the latest tuple
        Tap testSink = new Hfs(new SequenceFile(UrlDatum.FIELDS), resultsPath.toString(), false);
        TupleEntryIterator reader = testSink.openForRead(_conf);
        int count = 0;
        long latest = 0;
        while (reader.hasNext()) {
            TupleEntry next = reader.next();
            UrlDatum datum = new UrlDatum(next);
            latest = (Long) datum.getPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD);
            count++;
        }

        assertEquals(1, count);
        assertEquals(2, latest);
    }

    private void createDataFile(String fileName, List<UrlDatum> datums) throws IOException {
        Tap urlSink = new Hfs(new SequenceFile(UrlDatum.FIELDS), fileName, true);
        TupleEntryCollector writer = urlSink.openForWrite(_conf);
        for (UrlDatum datum : datums) {
            writer.add(datum.getTuple());
        }
        writer.close();
    }
}
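
The block comment above the disabled testOperate() carries the real lesson of this test class: during a GroupBy, Cascading reuses the TupleEntry returned by the arguments iterator, so a Buffer that stashes a plain reference (aDatum = datum) silently ends up holding whatever values the final group member wrote into the shared entry. A mock-based test hands the buffer a fresh entry per iteration and can never catch that bug, which is why testOperateWithGroupBy() runs a real flow instead. For reference, here is a minimal sketch of a "keep the latest" buffer written under that constraint. It is an illustration, not the shipped LatestUrlDatumBuffer: it assumes UrlDatum has a deep-copying copy constructor (the new DatumType(datum) pattern the comment names) and that comparing LAST_FETCHED_FIELD alone is enough to pick a winner.

package bixo.examples.crawl;

import java.util.Iterator;

import bixo.datum.UrlDatum;
import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Buffer;
import cascading.operation.BufferCall;
import cascading.tuple.TupleEntry;

import com.bixolabs.cascading.NullContext;

// Sketch only: illustrates the copy-on-keep pattern the test's comment calls for.
public class KeepLatestBufferSketch extends BaseOperation<NullContext> implements Buffer<NullContext> {

    public KeepLatestBufferSketch() {
        super(UrlDatum.FIELDS);
    }

    @Override
    public void operate(FlowProcess flowProcess, BufferCall<NullContext> bufferCall) {
        UrlDatum latestDatum = null;
        long latestFetched = Long.MIN_VALUE;

        Iterator<TupleEntry> iter = bufferCall.getArgumentsIterator();
        while (iter.hasNext()) {
            // Cascading may reuse this TupleEntry on the next call to next(),
            // so nothing that wraps it can be held across iterations.
            UrlDatum datum = new UrlDatum(iter.next());
            long fetched = (Long) datum.getPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD);
            if ((latestDatum == null) || (fetched > latestFetched)) {
                // Deep copy (assumed copy constructor); "latestDatum = datum"
                // would alias the reused entry and is exactly the bug described.
                latestDatum = new UrlDatum(datum);
                latestFetched = fetched;
            }
        }

        if (latestDatum != null) {
            bufferCall.getOutputCollector().add(latestDatum.getTuple());
        }
    }
}

Note how the assertion pair at the end of testOperateWithGroupBy() (count of 1, latest of 2) is exactly what a buffer like this must guarantee: one output tuple per URL group, carrying the newest fetch time. With the aliasing bug, the emitted tuple would instead reflect whichever datum the iterator happened to hand out last.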