org.lobid.lodmill.hadoop.IntegrationTestCollectSubjects.java Source code


Introduction

Here is the source code for org.lobid.lodmill.hadoop.IntegrationTestCollectSubjects.java, an integration test that runs the CollectSubjects MapReduce job on Hadoop's in-process mini cluster (via ClusterMapReduceTestCase) and verifies how blank nodes are handled.

Source

/* Copyright 2013 Fabian Steeg. Licensed under the Eclipse Public License 1.0 */

package org.lobid.lodmill.hadoop;

import java.io.IOException;
import java.util.Scanner;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.ClusterMapReduceTestCase;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Utils;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.lobid.lodmill.hadoop.CollectSubjects.CollectSubjectsMapper;
import org.lobid.lodmill.hadoop.CollectSubjects.CollectSubjectsReducer;
import org.slf4j.LoggerFactory;

/**
 * Test {@link CollectSubjects} job with blank nodes.
 * 
 * @author Fabian Steeg (fsteeg)
 */
@SuppressWarnings("javadoc")
public class IntegrationTestCollectSubjects extends ClusterMapReduceTestCase {
    private static final String TEST_FILE_1 = "src/test/resources/lobid-org-with-blank-nodes-1.nt";
    private static final String TEST_FILE_2 = "src/test/resources/lobid-org-with-blank-nodes-2.nt";
    private static final String HDFS_IN_1 = "blank-nodes-test/sample-1.nt";
    private static final String HDFS_IN_2 = "blank-nodes-test/sample-2.nt";
    private static final String HDFS_OUT = "out/sample";
    private FileSystem hdfs = null;

    @Before
    @Override
    public void setUp() throws Exception {
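        // ClusterMapReduceTestCase's mini cluster expects hadoop.log.dir to be
        // set before super.setUp() starts the cluster.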
        System.setProperty("hadoop.log.dir", "/tmp/logs");
        super.setUp();
        hdfs = getFileSystem();
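        // Stage the two local N-Triples test fixtures into the mini cluster's HDFS.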
        hdfs.copyFromLocalFile(new Path(TEST_FILE_1), new Path(HDFS_IN_1));
        hdfs.copyFromLocalFile(new Path(TEST_FILE_2), new Path(HDFS_IN_2));
    }

    @Test
    public void testBlankNodeResolution() throws IOException, ClassNotFoundException, InterruptedException {
        final Job job = createJob();
        assertTrue("Job should complete successfully", job.waitForCompletion(true));
        final String output = readResults().toString();
        System.err.println("Collection output:\n" + output);
        // Blank-node labels are generated per run, so strip them before comparing:
        assertEquals(
                " http://lobid.org/organisation/ACRPP,http://lobid.org/organisation/AAAAA\n"
                        + " http://lobid.org/organisation/ACRPP\n"
                        + "http://d-nb.info/gnd/129262110 http://lobid.org/organisation/ACRPP\n"
                        + "http://purl.org/lobid/fundertype#n08 http://lobid.org/organisation/ACRPP\n"
                        + "http://purl.org/lobid/stocksize#n06 http://lobid.org/organisation/ACRPP\n",
                output.replaceAll("_:[^\\s]+", ""));
        writeZippedMapFile();
    }

    private void writeZippedMapFile() throws IOException {
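        // Pack the job's output into a zipped map file next to it and assert
        // that the zip was written just now (modification time check below).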
        long time = System.currentTimeMillis();
        final Path[] outputFiles = FileUtil.stat2Paths(
                getFileSystem().listStatus(new Path(HDFS_OUT), new Utils.OutputFileUtils.OutputFilesFilter()));
        final Path zipOutputLocation = new Path(HDFS_OUT + "/" + CollectSubjects.MAP_FILE_ZIP);
        CollectSubjects.asZippedMapFile(hdfs, outputFiles[0], zipOutputLocation);
        final FileStatus fileStatus = hdfs.getFileStatus(zipOutputLocation);
        assertTrue(fileStatus.getModificationTime() >= time);
    }

    private Job createJob() throws IOException {
        final JobConf conf = createJobConf();
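        // Separate key and value with a space in the text output, and hand the
        // lobid.org subject prefix to the job via CollectSubjects.PREFIX_KEY.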
        conf.setStrings("mapred.textoutputformat.separator", " ");
        conf.setStrings(CollectSubjects.PREFIX_KEY, "http://lobid.org/organisation");
        final Job job = new Job(conf);
        job.setJobName("CollectSubjects");
        FileInputFormat.addInputPaths(job, HDFS_IN_1 + "," + HDFS_IN_2);
        FileOutputFormat.setOutputPath(job, new Path(HDFS_OUT));
        job.setMapperClass(CollectSubjectsMapper.class);
        job.setReducerClass(CollectSubjectsReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        return job;
    }

    private StringBuilder readResults() throws IOException {
        final Path[] outputFiles = FileUtil.stat2Paths(
                getFileSystem().listStatus(new Path(HDFS_OUT), new Utils.OutputFileUtils.OutputFilesFilter()));
        assertEquals("Expect a single output file", 1, outputFiles.length);
        final StringBuilder builder = new StringBuilder();
        try (final Scanner scanner = new Scanner(getFileSystem().open(outputFiles[0]))) {
            while (scanner.hasNextLine())
                builder.append(scanner.nextLine()).append("\n");
        }
        return builder;
    }

    @Override
    @After
    public void tearDown() {
        try {
            hdfs.close();
            super.stopCluster();
        } catch (Exception e) {
            LoggerFactory.getLogger(IntegrationTestCollectSubjects.class).error(e.getMessage(), e);
        }
    }
}
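
A note on the assertion in testBlankNodeResolution: blank-node labels are generated at parse time and differ between runs, which is why the test removes them with replaceAll before comparing. Here is a minimal, self-contained sketch of that normalization step (the sample line is hypothetical, not taken from the actual test fixtures):

public class BlankNodeStripDemo {
    public static void main(String[] args) {
        // Hypothetical reducer output line whose key is a generated blank-node label.
        String line = "_:node17 http://lobid.org/organisation/ACRPP";
        // The same normalization the test applies before comparing:
        String normalized = line.replaceAll("_:[^\\s]+", "");
        // Prints " http://lobid.org/organisation/ACRPP"; the leading space is
        // why the expected string in the test starts with a blank.
        System.out.println(normalized);
    }
}

If the surrounding project builds with Maven, the test can typically be run on its own with mvn -Dtest=IntegrationTestCollectSubjects test.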