List of usage examples for org.apache.hadoop.mapreduce.RecordReader#getCurrentKey()
public abstract KEYIN getCurrentKey() throws IOException, InterruptedException;
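Before the examples, here is a minimal sketch of the standard consumption pattern. The names inputFormat, split, context, and process() are placeholder assumptions, not taken from the examples below. Two points of the contract worth noting: getCurrentKey() is only valid after nextKeyValue() has returned true, and many readers reuse the same key object across records, so callers that retain keys should copy them.

    // Minimal usage sketch (illustrative; reader setup and process() are assumed helpers)
    RecordReader<Text, Text> reader = inputFormat.createRecordReader(split, context);
    reader.initialize(split, context);
    try {
        while (reader.nextKeyValue()) {
            Text key = reader.getCurrentKey(); // valid only after nextKeyValue() == true
            process(new Text(key));            // copy: the reader may reuse this object
        }
    } finally {
        reader.close();
    }

Several of the examples below illustrate both halves of this contract.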
From source file:it.crs4.seal.tsv_sort.TextSampler.java
License:Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 20 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param inFormat the input format to sample
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 */
public static void writePartitionFile(FileInputFormat<Text, Text> inFormat, JobContext job, Path partFile)
        throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    TaskAttemptContext taskContext = Utils.getTaskAttemptContext(conf);
    TextSampler sampler = new TextSampler();
    Text key = new Text();
    Text value = new Text();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE_CONF, SAMPLE_SIZE_DEFAULT);
    List<InputSplit> splits = inFormat.getSplits(job);
    int samples = Math.min(MAX_SLICES_SAMPLED, splits.size());
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.size() / samples;
    long records = 0;
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        InputSplit isplit = splits.get(sampleStep * i);
        RecordReader<Text, Text> reader = inFormat.createRecordReader(isplit, taskContext);
        reader.initialize(isplit, taskContext);
        while (reader.nextKeyValue()) {
            sampler.addKey(reader.getCurrentKey());
            records += 1;
            if ((i + 1) * recordsPerSample <= records) {
                break;
            }
        }
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile))
        outFs.delete(partFile, false);
    SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile, Text.class,
            NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    for (Text split : sampler.createPartitions(partitions)) {
        writer.append(split, nullValue);
    }
    writer.close();
}
From source file:org.apache.avro.mapreduce.TestAvroKeyRecordReader.java
License:Apache License
/**
 * Verifies that avro records can be read and progress is reported correctly.
 */
@Test
public void testReadRecords() throws IOException, InterruptedException {
    // Create the test avro file input with two records:
    //   1. "first"
    //   2. "second"
    final SeekableInput avroFileInput = new SeekableFileInput(
            AvroFiles.createFile(new File(mTempDir.getRoot(), "myStringfile.avro"),
                    Schema.create(Schema.Type.STRING), "first", "second"));

    // Create the record reader.
    Schema readerSchema = Schema.create(Schema.Type.STRING);
    RecordReader<AvroKey<CharSequence>, NullWritable> recordReader = new AvroKeyRecordReader<CharSequence>(
            readerSchema) {
        @Override
        protected SeekableInput createSeekableInput(Configuration conf, Path path) throws IOException {
            return avroFileInput;
        }
    };

    // Set up the job configuration.
    Configuration conf = new Configuration();

    // Create a mock input split for this record reader.
    FileSplit inputSplit = createMock(FileSplit.class);
    expect(inputSplit.getPath()).andReturn(new Path("/path/to/an/avro/file")).anyTimes();
    expect(inputSplit.getStart()).andReturn(0L).anyTimes();
    expect(inputSplit.getLength()).andReturn(avroFileInput.length()).anyTimes();

    // Create a mock task attempt context for this record reader.
    TaskAttemptContext context = createMock(TaskAttemptContext.class);
    expect(context.getConfiguration()).andReturn(conf).anyTimes();

    // Initialize the record reader.
    replay(inputSplit);
    replay(context);
    recordReader.initialize(inputSplit, context);

    assertEquals("Progress should be zero before any records are read", 0.0f, recordReader.getProgress(),
            0.0f);

    // Some variables to hold the records.
    AvroKey<CharSequence> key;
    NullWritable value;

    // Read the first record.
    assertTrue("Expected at least one record", recordReader.nextKeyValue());
    key = recordReader.getCurrentKey();
    value = recordReader.getCurrentValue();
    assertNotNull("First record had null key", key);
    assertNotNull("First record had null value", value);
    CharSequence firstString = key.datum();
    assertEquals("first", firstString.toString());

    assertTrue("getCurrentKey() returned different keys for the same record",
            key == recordReader.getCurrentKey());
    assertTrue("getCurrentValue() returned different values for the same record",
            value == recordReader.getCurrentValue());

    // Read the second record.
    assertTrue("Expected to read a second record", recordReader.nextKeyValue());
    key = recordReader.getCurrentKey();
    value = recordReader.getCurrentValue();
    assertNotNull("Second record had null key", key);
    assertNotNull("Second record had null value", value);
    CharSequence secondString = key.datum();
    assertEquals("second", secondString.toString());

    assertEquals("Progress should be complete (2 out of 2 records processed)", 1.0f,
            recordReader.getProgress(), 0.0f);

    // There should be no more records.
    assertFalse("Expected only 2 records", recordReader.nextKeyValue());

    // Close the record reader.
    recordReader.close();

    // Verify the expected calls on the mocks.
    verify(inputSplit);
    verify(context);
}
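Note the identity assertions key == recordReader.getCurrentKey() and value == recordReader.getCurrentValue(): repeated calls between advances return the same object, which is exactly why callers that buffer keys across nextKeyValue() calls should clone them.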
From source file:org.apache.avro.mapreduce.TestAvroKeyValueRecordReader.java
License:Apache License
/**
 * Verifies that avro records can be read and progress is reported correctly.
 */
@Test
public void testReadRecords() throws IOException, InterruptedException {
    // Create the test avro file input with two records:
    //   1. <"first", 1>
    //   2. <"second", 2>
    Schema keyValueSchema = AvroKeyValue.getSchema(Schema.create(Schema.Type.STRING),
            Schema.create(Schema.Type.INT));

    AvroKeyValue<CharSequence, Integer> firstInputRecord = new AvroKeyValue<CharSequence, Integer>(
            new GenericData.Record(keyValueSchema));
    firstInputRecord.setKey("first");
    firstInputRecord.setValue(1);

    AvroKeyValue<CharSequence, Integer> secondInputRecord = new AvroKeyValue<CharSequence, Integer>(
            new GenericData.Record(keyValueSchema));
    secondInputRecord.setKey("second");
    secondInputRecord.setValue(2);

    final SeekableInput avroFileInput = new SeekableFileInput(
            AvroFiles.createFile(new File(mTempDir.getRoot(), "myInputFile.avro"), keyValueSchema,
                    firstInputRecord.get(), secondInputRecord.get()));

    // Create the record reader over the avro input file.
    RecordReader<AvroKey<CharSequence>, AvroValue<Integer>> recordReader = new AvroKeyValueRecordReader<CharSequence, Integer>(
            Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.INT)) {
        @Override
        protected SeekableInput createSeekableInput(Configuration conf, Path path) throws IOException {
            return avroFileInput;
        }
    };

    // Set up the job configuration.
    Configuration conf = new Configuration();

    // Create a mock input split for this record reader.
    FileSplit inputSplit = createMock(FileSplit.class);
    expect(inputSplit.getPath()).andReturn(new Path("/path/to/an/avro/file")).anyTimes();
    expect(inputSplit.getStart()).andReturn(0L).anyTimes();
    expect(inputSplit.getLength()).andReturn(avroFileInput.length()).anyTimes();

    // Create a mock task attempt context for this record reader.
    TaskAttemptContext context = createMock(TaskAttemptContext.class);
    expect(context.getConfiguration()).andReturn(conf).anyTimes();

    // Initialize the record reader.
    replay(inputSplit);
    replay(context);
    recordReader.initialize(inputSplit, context);

    assertEquals("Progress should be zero before any records are read", 0.0f, recordReader.getProgress(),
            0.0f);

    // Some variables to hold the records.
    AvroKey<CharSequence> key;
    AvroValue<Integer> value;

    // Read the first record.
    assertTrue("Expected at least one record", recordReader.nextKeyValue());
    key = recordReader.getCurrentKey();
    value = recordReader.getCurrentValue();
    assertNotNull("First record had null key", key);
    assertNotNull("First record had null value", value);
    assertEquals("first", key.datum().toString());
    assertEquals(1, value.datum().intValue());

    assertTrue("getCurrentKey() returned different keys for the same record",
            key == recordReader.getCurrentKey());
    assertTrue("getCurrentValue() returned different values for the same record",
            value == recordReader.getCurrentValue());

    // Read the second record.
    assertTrue("Expected to read a second record", recordReader.nextKeyValue());
    key = recordReader.getCurrentKey();
    value = recordReader.getCurrentValue();
    assertNotNull("Second record had null key", key);
    assertNotNull("Second record had null value", value);
    assertEquals("second", key.datum().toString());
    assertEquals(2, value.datum().intValue());

    assertEquals("Progress should be complete (2 out of 2 records processed)", 1.0f,
            recordReader.getProgress(), 0.0f);

    // There should be no more records.
    assertFalse("Expected only 2 records", recordReader.nextKeyValue());

    // Close the record reader.
    recordReader.close();

    // Verify the expected calls on the mocks.
    verify(inputSplit);
    verify(context);
}
From source file:org.apache.crunch.kafka.inputformat.KafkaInputFormatIT.java
License:Apache License
@Test
public void getSplitsCreateReaders() throws IOException, InterruptedException {
    List<String> keys = ClusterTest.writeData(ClusterTest.getProducerProperties(), topic, "batch", 10, 10);
    Map<TopicPartition, Long> startOffsets = getBrokerOffsets(consumerProps, OffsetRequest.EarliestTime(),
            topic);
    Map<TopicPartition, Long> endOffsets = getBrokerOffsets(consumerProps, OffsetRequest.LatestTime(), topic);

    Map<TopicPartition, Pair<Long, Long>> offsets = new HashMap<>();
    for (Map.Entry<TopicPartition, Long> entry : startOffsets.entrySet()) {
        Long endingOffset = endOffsets.get(entry.getKey());
        offsets.put(entry.getKey(), Pair.of(entry.getValue(), endingOffset));
    }

    KafkaInputFormat.writeOffsetsToConfiguration(offsets, config);

    KafkaInputFormat inputFormat = new KafkaInputFormat();
    inputFormat.setConf(config);
    List<InputSplit> splits = inputFormat.getSplits(null);

    assertThat(splits.size(), is(offsets.size()));

    for (InputSplit split : splits) {
        KafkaInputSplit inputSplit = (KafkaInputSplit) split;
        Pair<Long, Long> startEnd = offsets.get(inputSplit.getTopicPartition());
        assertThat(inputSplit.getStartingOffset(), is(startEnd.first()));
        assertThat(inputSplit.getEndingOffset(), is(startEnd.second()));
    }

    // create readers and consume the data
    when(taskContext.getConfiguration()).thenReturn(config);
    Set<String> keysRead = new HashSet<>();
    // read all data from all splits
    for (InputSplit split : splits) {
        KafkaInputSplit inputSplit = (KafkaInputSplit) split;
        long start = inputSplit.getStartingOffset();
        long end = inputSplit.getEndingOffset();
        RecordReader<BytesWritable, BytesWritable> recordReader = inputFormat.createRecordReader(split,
                taskContext);
        recordReader.initialize(split, taskContext);

        int numRecordsFound = 0;
        String currentKey;
        while (recordReader.nextKeyValue()) {
            currentKey = new String(recordReader.getCurrentKey().getBytes());
            keysRead.add(currentKey);
            assertThat(keys, hasItem(currentKey));
            assertThat(recordReader.getCurrentValue(), is(notNullValue()));
            numRecordsFound++;
        }
        recordReader.close();

        // assert that it encountered a partition's worth of data
        assertThat(((long) numRecordsFound), is(end - start));
    }

    // validate that the same number of unique keys was read as were written
    assertThat(keysRead.size(), is(keys.size()));
}
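One caveat with the key-materialization line above (an observation about the Hadoop API, not part of the test itself): BytesWritable.getBytes() returns the full backing buffer, which can be longer than the valid data reported by getLength(), so new String(writable.getBytes(), 0, writable.getLength()) or new String(writable.copyBytes()) is the safer way to turn the current key into a String.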
From source file:org.apache.hadoop.examples.terasort.TeraInputFormat.java
License:Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws Throwable if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(TeraSortConfigKeys.SAMPLE_SIZE.key(),
            TeraSortConfigKeys.DEFAULT_SAMPLE_SIZE);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(
            conf.getInt(TeraSortConfigKeys.NUM_PARTITIONS.key(), TeraSortConfigKeys.DEFAULT_NUM_PARTITIONS),
            splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat
                            .createRecordReader(splits.get(sampleStep * idx), context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {
                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing partitions took " + (t3 - t2) + "ms");
}
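Note how this version wraps each sampled key in new Text(reader.getCurrentKey()) before handing it to the shared sampler: the reader is free to reuse the same Text instance on the next nextKeyValue() call, so keys that outlive the current loop iteration must be cloned.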
From source file:org.apache.hcatalog.pig.TestE2EScenarios.java
License:Apache License
private void copyTable(String in, String out) throws IOException, InterruptedException {
    Job ijob = new Job();
    Job ojob = new Job();
    HCatInputFormat inpy = new HCatInputFormat();
    inpy.setInput(ijob, null, in);
    HCatOutputFormat oupy = new HCatOutputFormat();
    oupy.setOutput(ojob, OutputJobInfo.create(null, out, new HashMap<String, String>()));

    // Test HCatContext
    System.err.println("HCatContext INSTANCE is present : " + HCatContext.INSTANCE.getConf().isPresent());
    if (HCatContext.INSTANCE.getConf().isPresent()) {
        System.err.println("HCatContext tinyint->int promotion says " + HCatContext.INSTANCE.getConf().get()
                .getBoolean(HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION,
                        HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION_DEFAULT));
    }

    HCatSchema tableSchema = inpy.getTableSchema(ijob.getConfiguration());
    System.err.println("Copying from [" + in + "] to [" + out + "] with schema : " + tableSchema.toString());
    oupy.setSchema(ojob, tableSchema);
    oupy.checkOutputSpecs(ojob);
    OutputCommitter oc = oupy.getOutputCommitter(createTaskAttemptContext(ojob.getConfiguration()));
    oc.setupJob(ojob);

    for (InputSplit split : inpy.getSplits(ijob)) {
        TaskAttemptContext rtaskContext = createTaskAttemptContext(ijob.getConfiguration());
        TaskAttemptContext wtaskContext = createTaskAttemptContext(ojob.getConfiguration());

        RecordReader<WritableComparable, HCatRecord> rr = inpy.createRecordReader(split, rtaskContext);
        rr.initialize(split, rtaskContext);

        OutputCommitter taskOc = oupy.getOutputCommitter(wtaskContext);
        taskOc.setupTask(wtaskContext);
        RecordWriter<WritableComparable<?>, HCatRecord> rw = oupy.getRecordWriter(wtaskContext);

        while (rr.nextKeyValue()) {
            rw.write(rr.getCurrentKey(), rr.getCurrentValue());
        }
        rw.close(wtaskContext);
        taskOc.commitTask(wtaskContext);
        rr.close();
    }
    oc.commitJob(ojob);
}
From source file:org.apache.hyracks.hdfs2.dataflow.HDFSReadOperatorDescriptor.java
License:Apache License
@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
        throws HyracksDataException {
    final List<FileSplit> inputSplits = splitsFactory.getSplits();

    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        private String nodeName = ctx.getJobletContext().getApplicationContext().getNodeId();
        private ContextFactory ctxFactory = new ContextFactory();

        @SuppressWarnings("unchecked")
        @Override
        public void initialize() throws HyracksDataException {
            ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
            try {
                writer.open();
                Thread.currentThread().setContextClassLoader(ctx.getJobletContext().getClassLoader());
                Job job = confFactory.getConf();
                job.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                IKeyValueParser parser = tupleParserFactory.createKeyValueParser(ctx);
                InputFormat inputFormat = ReflectionUtils.newInstance(job.getInputFormatClass(),
                        job.getConfiguration());
                int size = inputSplits.size();
                for (int i = 0; i < size; i++) {
                    /**
                     * read all the partitions scheduled to the current node
                     */
                    if (scheduledLocations[i].equals(nodeName)) {
                        /**
                         * pick an unread split to read; synchronize among
                         * simultaneous partitions on the same machine
                         */
                        synchronized (executed) {
                            if (executed[i] == false) {
                                executed[i] = true;
                            } else {
                                continue;
                            }
                        }
                        /**
                         * read the split
                         */
                        TaskAttemptContext context = ctxFactory.createContext(job.getConfiguration(), i);
                        context.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                        RecordReader reader = inputFormat.createRecordReader(inputSplits.get(i), context);
                        reader.initialize(inputSplits.get(i), context);
                        while (reader.nextKeyValue()) {
                            parser.parse(reader.getCurrentKey(), reader.getCurrentValue(), writer,
                                    inputSplits.get(i).toString());
                        }
                    }
                }
                parser.close(writer);
            } catch (Throwable th) {
                writer.fail();
                throw new HyracksDataException(th);
            } finally {
                writer.close();
                Thread.currentThread().setContextClassLoader(ctxCL);
            }
        }
    };
}
From source file:org.apache.jena.hadoop.rdf.io.input.bnodes.AbstractBlankNodeTests.java
License:Apache License
/**
 * Test that starts with two blank nodes with the same identity in a single
 * file, splits them over two files and checks that we can work around
 * JENA-820 successfully by setting the
 * {@link RdfIOConstants#GLOBAL_BNODE_IDENTITY} flag for our subsequent job
 *
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public final void blank_node_divergence_01() throws IOException, InterruptedException {
    Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile());
    Assume.assumeFalse("Requires that Blank Node identity not be preserved",
            this.preservesBlankNodeIdentity());

    // Temporary files
    File a = File.createTempFile("bnode_divergence", getInitialInputExtension());
    File intermediateOutputDir = Files.createTempDirectory("bnode_divergence", new FileAttribute[0]).toFile();

    try {
        // Prepare the input data
        // Two mentions of the same blank node in the same file
        List<T> tuples = new ArrayList<>();
        Node bnode = NodeFactory.createBlankNode();
        Node pred = NodeFactory.createURI("http://example.org/predicate");
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first")));
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second")));
        writeTuples(a, tuples);

        // Set up fake job which will process the file as a single split
        Configuration config = new Configuration(true);
        InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat();
        Job job = Job.getInstance(config);
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath()));
        JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        List<InputSplit> splits = inputFormat.getSplits(context);
        Assert.assertEquals(1, splits.size());

        for (InputSplit split : splits) {
            // Initialize the input reading
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    createAttemptID(1, 1, 1));
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split,
                    inputTaskContext);
            reader.initialize(split, inputTaskContext);

            // Copy the input to the output - each triple goes to a separate
            // output file
            // This is how we force multiple files to be produced
            int taskID = 1;
            while (reader.nextKeyValue()) {
                // Prepare the output writing
                OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat();
                TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                        createAttemptID(1, ++taskID, 1));
                RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext);
                writer.write(reader.getCurrentKey(), reader.getCurrentValue());
                writer.close(outputTaskContext);
            }
        }

        // Promote outputs from temporary status
        promoteInputs(intermediateOutputDir);

        // Now we need to create a subsequent job that reads the
        // intermediate outputs
        // As described in JENA-820, at this point the blank nodes are
        // consistent; however, when we read them from different files they
        // by default get treated as different nodes and so the blank nodes
        // diverge, which is incorrect and undesirable behaviour in
        // multi-stage pipelines
        LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath());
        job = Job.getInstance(config);
        inputFormat = createIntermediateInputFormat();
        job.setInputFormatClass(inputFormat.getClass());
        FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath()));
        // Enabling this flag works around the JENA-820 issue
        job.getConfiguration().setBoolean(RdfIOConstants.GLOBAL_BNODE_IDENTITY, true);
        context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        splits = inputFormat.getSplits(context);
        Assert.assertEquals(2, splits.size());

        // Expect to end up with a single blank node
        Set<Node> nodes = new HashSet<Node>();
        for (InputSplit split : splits) {
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    new TaskAttemptID());
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split,
                    inputTaskContext);
            reader.initialize(split, inputTaskContext);
            while (reader.nextKeyValue()) {
                nodes.add(getSubject(reader.getCurrentValue().get()));
            }
        }

        // Nodes should not have diverged
        Assert.assertEquals(1, nodes.size());
    } finally {
        a.delete();
        deleteDirectory(intermediateOutputDir);
    }
}
From source file:org.apache.jena.hadoop.rdf.io.input.bnodes.AbstractBlankNodeTests.java
License:Apache License
/**
 * Test that starts with two blank nodes with the same identity in a single
 * file, splits them over two files and shows that they diverge in the
 * subsequent job when the JENA-820 workaround is not enabled
 *
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void blank_node_divergence_02() throws IOException, InterruptedException {
    Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile());
    Assume.assumeFalse("Requires that Blank Node identity not be preserved",
            this.preservesBlankNodeIdentity());

    // Temporary files
    File a = File.createTempFile("bnode_divergence", getInitialInputExtension());
    File intermediateOutputDir = Files.createTempDirectory("bnode_divergence", new FileAttribute[0]).toFile();

    try {
        // Prepare the input data
        // Two mentions of the same blank node in the same file
        List<T> tuples = new ArrayList<>();
        Node bnode = NodeFactory.createBlankNode();
        Node pred = NodeFactory.createURI("http://example.org/predicate");
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first")));
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second")));
        writeTuples(a, tuples);

        // Set up fake job which will process the file as a single split
        Configuration config = new Configuration(true);
        InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat();
        Job job = Job.getInstance(config);
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath()));
        JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        List<InputSplit> splits = inputFormat.getSplits(context);
        Assert.assertEquals(1, splits.size());

        for (InputSplit split : splits) {
            // Initialize the input reading
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    createAttemptID(1, 1, 1));
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split,
                    inputTaskContext);
            reader.initialize(split, inputTaskContext);

            // Copy the input to the output - each triple goes to a separate
            // output file
            // This is how we force multiple files to be produced
            int taskID = 1;
            while (reader.nextKeyValue()) {
                // Prepare the output writing
                OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat();
                TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                        createAttemptID(1, ++taskID, 1));
                RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext);
                writer.write(reader.getCurrentKey(), reader.getCurrentValue());
                writer.close(outputTaskContext);
            }
        }

        // Promote outputs from temporary status
        promoteInputs(intermediateOutputDir);

        // Now we need to create a subsequent job that reads the
        // intermediate outputs
        // As described in JENA-820, at this point the blank nodes are
        // consistent; however, when we read them from different files they
        // by default get treated as different nodes and so the blank nodes
        // diverge, which is incorrect and undesirable behaviour in
        // multi-stage pipelines. However it is the default behaviour
        // because when we start from external inputs we want them to be
        // file scoped.
        LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath());
        job = Job.getInstance(config);
        inputFormat = createIntermediateInputFormat();
        job.setInputFormatClass(inputFormat.getClass());
        FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath()));
        // Make sure the JENA-820 flag is disabled
        job.getConfiguration().setBoolean(RdfIOConstants.GLOBAL_BNODE_IDENTITY, false);
        context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        splits = inputFormat.getSplits(context);
        Assert.assertEquals(2, splits.size());

        // Expect to end up with two diverged blank nodes
        Set<Node> nodes = new HashSet<Node>();
        for (InputSplit split : splits) {
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    new TaskAttemptID());
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split,
                    inputTaskContext);
            reader.initialize(split, inputTaskContext);
            while (reader.nextKeyValue()) {
                nodes.add(getSubject(reader.getCurrentValue().get()));
            }
        }

        // Nodes should have diverged
        Assert.assertEquals(2, nodes.size());
    } finally {
        a.delete();
        deleteDirectory(intermediateOutputDir);
    }
}
From source file:org.apache.jena.hadoop.rdf.io.input.bnodes.AbstractBlankNodeTests.java
License:Apache License
/**
 * Test that starts with two blank nodes in two different files and checks
 * that writing them to a single file does not conflate them
 *
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void blank_node_identity_01() throws IOException, InterruptedException {
    Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile());
    Assume.assumeFalse("Requires that Blank Node identity not be preserved",
            this.preservesBlankNodeIdentity());

    // Temporary files
    File a = File.createTempFile("bnode_identity", getInitialInputExtension());
    File b = File.createTempFile("bnode_identity", getInitialInputExtension());
    File intermediateOutputDir = Files.createTempDirectory("bnode_identity", new FileAttribute[0]).toFile();

    try {
        // Prepare the input data
        // Different blank nodes in different files
        List<T> tuples = new ArrayList<>();
        Node bnode1 = NodeFactory.createBlankNode();
        Node bnode2 = NodeFactory.createBlankNode();
        Node pred = NodeFactory.createURI("http://example.org/predicate");
        tuples.add(createTuple(bnode1, pred, NodeFactory.createLiteral("first")));
        writeTuples(a, tuples);
        tuples.clear();
        tuples.add(createTuple(bnode2, pred, NodeFactory.createLiteral("second")));
        writeTuples(b, tuples);

        // Set up fake job which will process the two files
        Configuration config = new Configuration(true);
        InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat();
        Job job = Job.getInstance(config);
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()), new Path(b.getAbsolutePath()));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath()));
        JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        List<InputSplit> splits = inputFormat.getSplits(context);
        Assert.assertEquals(2, splits.size());

        // Prepare the output writing - putting all output to a single file
        OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat();
        TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                createAttemptID(1, 2, 1));
        RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext);

        for (InputSplit split : splits) {
            // Initialize the input reading
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    createAttemptID(1, 1, 1));
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split,
                    inputTaskContext);
            reader.initialize(split, inputTaskContext);

            // Copy the input to the output - all triples go to a single
            // output
            while (reader.nextKeyValue()) {
                writer.write(reader.getCurrentKey(), reader.getCurrentValue());
            }
        }
        writer.close(outputTaskContext);

        // Promote outputs from temporary status
        promoteInputs(intermediateOutputDir);

        // Now we need to create a subsequent job that reads the
        // intermediate outputs
        // The blank nodes should have been given separate identities so we
        // should not be conflating them; this is the opposite problem to
        // that described in JENA-820
        LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath());
        job = Job.getInstance(config);
        inputFormat = createIntermediateInputFormat();
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath()));
        context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        splits = inputFormat.getSplits(context);
        Assert.assertEquals(1, splits.size());

        // Expect to end up with two distinct blank nodes
        Set<Node> nodes = new HashSet<Node>();
        for (InputSplit split : splits) {
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    new TaskAttemptID());
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split,
                    inputTaskContext);
            reader.initialize(split, inputTaskContext);
            while (reader.nextKeyValue()) {
                nodes.add(getSubject(reader.getCurrentValue().get()));
            }
        }

        // Nodes must not have been conflated
        Assert.assertEquals(2, nodes.size());
    } finally {
        a.delete();
        b.delete();
        deleteDirectory(intermediateOutputDir);
    }
}