Example usage for org.apache.hadoop.mapreduce RecordReader nextKeyValue

List of usage examples for org.apache.hadoop.mapreduce RecordReader nextKeyValue

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce RecordReader nextKeyValue.

Prototype

public abstract boolean nextKeyValue() throws IOException, InterruptedException;

Source Link

Document

Read the next key, value pair.

Usage

From source file:org.apache.hyracks.hdfs2.dataflow.HDFSReadOperatorDescriptor.java

License:Apache License

@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
        throws HyracksDataException {
    final List<FileSplit> inputSplits = splitsFactory.getSplits();

    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        private String nodeName = ctx.getJobletContext().getApplicationContext().getNodeId();
        private ContextFactory ctxFactory = new ContextFactory();

        @SuppressWarnings("unchecked")
        @Override/*w  w  w . ja v  a2  s .  c  o m*/
        public void initialize() throws HyracksDataException {
            ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
            try {
                writer.open();
                Thread.currentThread().setContextClassLoader(ctx.getJobletContext().getClassLoader());
                Job job = confFactory.getConf();
                job.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                IKeyValueParser parser = tupleParserFactory.createKeyValueParser(ctx);
                InputFormat inputFormat = ReflectionUtils.newInstance(job.getInputFormatClass(),
                        job.getConfiguration());
                int size = inputSplits.size();
                for (int i = 0; i < size; i++) {
                    /**
                     * read all the partitions scheduled to the current node
                     */
                    if (scheduledLocations[i].equals(nodeName)) {
                        /**
                         * pick an unread split to read synchronize among
                         * simultaneous partitions in the same machine
                         */
                        synchronized (executed) {
                            if (executed[i] == false) {
                                executed[i] = true;
                            } else {
                                continue;
                            }
                        }

                        /**
                         * read the split
                         */
                        TaskAttemptContext context = ctxFactory.createContext(job.getConfiguration(), i);
                        context.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                        RecordReader reader = inputFormat.createRecordReader(inputSplits.get(i), context);
                        reader.initialize(inputSplits.get(i), context);
                        while (reader.nextKeyValue() == true) {
                            parser.parse(reader.getCurrentKey(), reader.getCurrentValue(), writer,
                                    inputSplits.get(i).toString());
                        }
                    }
                }
                parser.close(writer);
            } catch (Throwable th) {
                writer.fail();
                throw new HyracksDataException(th);
            } finally {
                writer.close();
                Thread.currentThread().setContextClassLoader(ctxCL);
            }
        }
    };
}

From source file:org.apache.jena.hadoop.rdf.io.input.AbstractNodeTupleInputFormatTests.java

License:Apache License

protected final int countTuples(RecordReader<LongWritable, T> reader) throws IOException, InterruptedException {
    int count = 0;

    // Check initial progress
    LOG.info(String.format("Initial Reported Progress %f", reader.getProgress()));
    float progress = reader.getProgress();
    if (Float.compare(0.0f, progress) == 0) {
        Assert.assertEquals(0.0d, reader.getProgress(), 0.0d);
    } else if (Float.compare(1.0f, progress) == 0) {
        // If reader is reported 1.0 straight away then we expect there to
        // be no key values
        Assert.assertEquals(1.0d, reader.getProgress(), 0.0d);
        Assert.assertFalse(reader.nextKeyValue());
    } else {/*www. j  av  a2s.  co  m*/
        Assert.fail(String.format(
                "Expected progress of 0.0 or 1.0 before reader has been accessed for first time but got %f",
                progress));
    }

    // Count tuples
    boolean debug = LOG.isDebugEnabled();
    while (reader.nextKeyValue()) {
        count++;
        progress = reader.getProgress();
        if (debug)
            LOG.debug(String.format("Current Reported Progress %f", progress));
        Assert.assertTrue(String.format("Progress should be in the range 0.0 < p <= 1.0 but got %f", progress),
                progress > 0.0f && progress <= 1.0f);
    }
    reader.close();
    LOG.info(String.format("Got %d tuples from this record reader", count));

    // Check final progress
    LOG.info(String.format("Final Reported Progress %f", reader.getProgress()));
    Assert.assertEquals(1.0d, reader.getProgress(), 0.0d);

    return count;
}

From source file:org.apache.jena.hadoop.rdf.io.input.bnodes.AbstractBlankNodeTests.java

License:Apache License

/**
 * Test that starts with two blank nodes with the same identity in a single
 * file, splits them over two files and checks that we can workaround
 * JENA-820 successfully by setting the/*from  w  w  w.jav a 2s .c o  m*/
 * {@link RdfIOConstants#GLOBAL_BNODE_IDENTITY} flag for our subsequent job
 * 
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public final void blank_node_divergence_01() throws IOException, InterruptedException {
    Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile());
    Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity());

    // Temporary files
    File a = File.createTempFile("bnode_divergence", getInitialInputExtension());
    File intermediateOutputDir = Files.createTempDirectory("bnode_divergence", new FileAttribute[0]).toFile();

    try {
        // Prepare the input data
        // Two mentions of the same blank node in the same file
        List<T> tuples = new ArrayList<>();
        Node bnode = NodeFactory.createBlankNode();
        Node pred = NodeFactory.createURI("http://example.org/predicate");
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first")));
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second")));
        writeTuples(a, tuples);

        // Set up fake job which will process the file as a single split
        Configuration config = new Configuration(true);
        InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat();
        Job job = Job.getInstance(config);
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath()));
        JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        List<InputSplit> splits = inputFormat.getSplits(context);
        Assert.assertEquals(1, splits.size());

        for (InputSplit split : splits) {
            // Initialize the input reading
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    createAttemptID(1, 1, 1));
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);

            // Copy the input to the output - each triple goes to a separate
            // output file
            // This is how we force multiple files to be produced
            int taskID = 1;
            while (reader.nextKeyValue()) {
                // Prepare the output writing
                OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat();
                TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                        createAttemptID(1, ++taskID, 1));
                RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext);

                writer.write(reader.getCurrentKey(), reader.getCurrentValue());
                writer.close(outputTaskContext);
            }
        }

        // Promote outputs from temporary status
        promoteInputs(intermediateOutputDir);

        // Now we need to create a subsequent job that reads the
        // intermediate outputs
        // As described in JENA-820 at this point the blank nodes are
        // consistent, however when we read them from different files they
        // by default get treated as different nodes and so the blank nodes
        // diverge which is incorrect and undesirable behaviour in
        // multi-stage pipelines
        LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath());
        job = Job.getInstance(config);
        inputFormat = createIntermediateInputFormat();
        job.setInputFormatClass(inputFormat.getClass());
        FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath()));

        // Enabling this flag works around the JENA-820 issue
        job.getConfiguration().setBoolean(RdfIOConstants.GLOBAL_BNODE_IDENTITY, true);
        context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        splits = inputFormat.getSplits(context);
        Assert.assertEquals(2, splits.size());

        // Expect to end up with a single blank node
        Set<Node> nodes = new HashSet<Node>();
        for (InputSplit split : splits) {
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    new TaskAttemptID());
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);

            while (reader.nextKeyValue()) {
                nodes.add(getSubject(reader.getCurrentValue().get()));
            }
        }
        // Nodes should not have diverged
        Assert.assertEquals(1, nodes.size());

    } finally {
        a.delete();
        deleteDirectory(intermediateOutputDir);
    }
}

From source file:org.apache.jena.hadoop.rdf.io.input.bnodes.AbstractBlankNodeTests.java

License:Apache License

/**
 * Test that starts with two blank nodes with the same identity in a single
 * file, splits them over two files and shows that they diverge in the
 * subsequent job when the JENA-820 workaround is not enabled
 * /* ww  w. j  a v a  2s  . com*/
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void blank_node_divergence_02() throws IOException, InterruptedException {
    Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile());
    Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity());

    // Temporary files
    File a = File.createTempFile("bnode_divergence", getInitialInputExtension());
    File intermediateOutputDir = Files.createTempDirectory("bnode_divergence", new FileAttribute[0]).toFile();

    try {
        // Prepare the input data
        // Two mentions of the same blank node in the same file
        List<T> tuples = new ArrayList<>();
        Node bnode = NodeFactory.createBlankNode();
        Node pred = NodeFactory.createURI("http://example.org/predicate");
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first")));
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second")));
        writeTuples(a, tuples);

        // Set up fake job which will process the file as a single split
        Configuration config = new Configuration(true);
        InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat();
        Job job = Job.getInstance(config);
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath()));
        JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        List<InputSplit> splits = inputFormat.getSplits(context);
        Assert.assertEquals(1, splits.size());

        for (InputSplit split : splits) {
            // Initialize the input reading
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    createAttemptID(1, 1, 1));
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);

            // Copy the input to the output - each triple goes to a separate
            // output file
            // This is how we force multiple files to be produced
            int taskID = 1;
            while (reader.nextKeyValue()) {
                // Prepare the output writing
                OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat();
                TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                        createAttemptID(1, ++taskID, 1));
                RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext);

                writer.write(reader.getCurrentKey(), reader.getCurrentValue());
                writer.close(outputTaskContext);
            }
        }

        // Promote outputs from temporary status
        promoteInputs(intermediateOutputDir);

        // Now we need to create a subsequent job that reads the
        // intermediate outputs
        // As described in JENA-820 at this point the blank nodes are
        // consistent, however when we read them from different files they
        // by default get treated as different nodes and so the blank nodes
        // diverge which is incorrect and undesirable behaviour in
        // multi-stage pipelines. However it is the default behaviour
        // because when we start from external inputs we want them to be
        // file scoped.
        LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath());
        job = Job.getInstance(config);
        inputFormat = createIntermediateInputFormat();
        job.setInputFormatClass(inputFormat.getClass());
        FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath()));

        // Make sure JENA-820 flag is disabled
        job.getConfiguration().setBoolean(RdfIOConstants.GLOBAL_BNODE_IDENTITY, false);
        context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        splits = inputFormat.getSplits(context);
        Assert.assertEquals(2, splits.size());

        // Expect to end up with a single blank node
        Set<Node> nodes = new HashSet<Node>();
        for (InputSplit split : splits) {
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    new TaskAttemptID());
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);

            while (reader.nextKeyValue()) {
                nodes.add(getSubject(reader.getCurrentValue().get()));
            }
        }
        // Nodes should have diverged
        Assert.assertEquals(2, nodes.size());

    } finally {
        a.delete();
        deleteDirectory(intermediateOutputDir);
    }
}

From source file:org.apache.jena.hadoop.rdf.io.input.bnodes.AbstractBlankNodeTests.java

License:Apache License

/**
 * Test that starts with two blank nodes in two different files and checks
 * that writing them to a single file does not conflate them
 * //from w  w w.  j  a  v a 2  s  .co m
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void blank_node_identity_01() throws IOException, InterruptedException {
    Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile());
    Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity());

    // Temporary files
    File a = File.createTempFile("bnode_identity", getInitialInputExtension());
    File b = File.createTempFile("bnode_identity", getInitialInputExtension());
    File intermediateOutputDir = Files.createTempDirectory("bnode_identity", new FileAttribute[0]).toFile();

    try {
        // Prepare the input data
        // Different blank nodes in different files
        List<T> tuples = new ArrayList<>();
        Node bnode1 = NodeFactory.createBlankNode();
        Node bnode2 = NodeFactory.createBlankNode();
        Node pred = NodeFactory.createURI("http://example.org/predicate");

        tuples.add(createTuple(bnode1, pred, NodeFactory.createLiteral("first")));
        writeTuples(a, tuples);

        tuples.clear();
        tuples.add(createTuple(bnode2, pred, NodeFactory.createLiteral("second")));
        writeTuples(b, tuples);

        // Set up fake job which will process the two files
        Configuration config = new Configuration(true);
        InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat();
        Job job = Job.getInstance(config);
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()), new Path(b.getAbsolutePath()));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath()));
        JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        List<InputSplit> splits = inputFormat.getSplits(context);
        Assert.assertEquals(2, splits.size());

        // Prepare the output writing - putting all output to a single file
        OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat();
        TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                createAttemptID(1, 2, 1));
        RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext);

        for (InputSplit split : splits) {
            // Initialize the input reading
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    createAttemptID(1, 1, 1));
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);

            // Copy the input to the output - all triples go to a single
            // output
            while (reader.nextKeyValue()) {
                writer.write(reader.getCurrentKey(), reader.getCurrentValue());
            }
        }
        writer.close(outputTaskContext);

        // Promote outputs from temporary status
        promoteInputs(intermediateOutputDir);

        // Now we need to create a subsequent job that reads the
        // intermediate outputs
        // The Blank nodes should have been given separate identities so we
        // should not be conflating them, this is the opposite problem to
        // that described in JENA-820
        LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath());
        job = Job.getInstance(config);
        inputFormat = createIntermediateInputFormat();
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath()));
        context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        splits = inputFormat.getSplits(context);
        Assert.assertEquals(1, splits.size());

        // Expect to end up with a single blank node
        Set<Node> nodes = new HashSet<Node>();
        for (InputSplit split : splits) {
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    new TaskAttemptID());
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);

            while (reader.nextKeyValue()) {
                nodes.add(getSubject(reader.getCurrentValue().get()));
            }
        }
        // Nodes must not have converged
        Assert.assertEquals(2, nodes.size());

    } finally {
        a.delete();
        b.delete();
        deleteDirectory(intermediateOutputDir);
    }
}

From source file:org.apache.jena.hadoop.rdf.io.input.bnodes.AbstractBlankNodeTests.java

License:Apache License

/**
 * Test that starts with two blank nodes in two different files and checks
 * that writing them to a single file does not conflate them
 * //from   www . j ava  2  s .  c  o m
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void blank_node_identity_02() throws IOException, InterruptedException {
    Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile());
    Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity());

    // Temporary files
    File a = File.createTempFile("bnode_identity", getInitialInputExtension());
    File b = File.createTempFile("bnode_identity", getInitialInputExtension());
    File intermediateOutputDir = Files.createTempDirectory("bnode_identity", new FileAttribute[0]).toFile();

    try {
        // Prepare the input data
        // Same blank node but in different files so must be treated as
        // different blank nodes and not converge
        List<T> tuples = new ArrayList<>();
        Node bnode = NodeFactory.createBlankNode();
        Node pred = NodeFactory.createURI("http://example.org/predicate");

        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first")));
        writeTuples(a, tuples);

        tuples.clear();
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second")));
        writeTuples(b, tuples);

        // Set up fake job which will process the two files
        Configuration config = new Configuration(true);
        InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat();
        Job job = Job.getInstance(config);
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()), new Path(b.getAbsolutePath()));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath()));
        JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        List<InputSplit> splits = inputFormat.getSplits(context);
        Assert.assertEquals(2, splits.size());

        // Prepare the output writing - putting all output to a single file
        OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat();
        TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                createAttemptID(1, 2, 1));
        RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext);

        for (InputSplit split : splits) {
            // Initialize the input reading
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    createAttemptID(1, 1, 1));
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);

            // Copy the input to the output - all triples go to a single
            // output
            while (reader.nextKeyValue()) {
                writer.write(reader.getCurrentKey(), reader.getCurrentValue());
            }
        }
        writer.close(outputTaskContext);

        // Promote outputs from temporary status
        promoteInputs(intermediateOutputDir);

        // Now we need to create a subsequent job that reads the
        // intermediate outputs
        // The Blank nodes should have been given separate identities so we
        // should not be conflating them, this is the opposite problem to
        // that described in JENA-820
        LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath());
        job = Job.getInstance(config);
        inputFormat = createIntermediateInputFormat();
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath()));
        context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        splits = inputFormat.getSplits(context);
        Assert.assertEquals(1, splits.size());

        // Expect to end up with a single blank node
        Set<Node> nodes = new HashSet<Node>();
        for (InputSplit split : splits) {
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    new TaskAttemptID());
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);

            while (reader.nextKeyValue()) {
                nodes.add(getSubject(reader.getCurrentValue().get()));
            }
        }
        // Nodes must not diverge
        Assert.assertEquals(2, nodes.size());

    } finally {
        a.delete();
        b.delete();
        deleteDirectory(intermediateOutputDir);
    }
}

From source file:org.apache.jena.tdbloader4.partitioners.SplitSampler.java

License:Apache License

/**
 * From each split sampled, take the first numSamples / numSplits records.
 *//*from w w w . j a v a 2  s. c  o m*/
@SuppressWarnings("unchecked")
// ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K, V> inf, Job job) throws IOException, InterruptedException {
    List<InputSplit> splits = inf.getSplits(job);
    ArrayList<K> samples = new ArrayList<K>(numSamples);
    int splitsToSample = Math.min(maxSplitsSampled, splits.size());
    int samplesPerSplit = numSamples / splitsToSample;
    log.debug("Sampling {} splits, taking {} samples per split", splitsToSample, samplesPerSplit);
    long records = 0;
    for (int i = 0; i < splitsToSample; ++i) {
        TaskAttemptContext samplingContext = new TaskAttemptContext(job.getConfiguration(),
                new TaskAttemptID());
        InputSplit split = splits.get(i);
        log.debug("Sampling {} split", split);
        RecordReader<K, V> reader = inf.createRecordReader(split, samplingContext);
        reader.initialize(split, samplingContext);
        while (reader.nextKeyValue()) {
            LongQuadWritable currentKey = (LongQuadWritable) reader.getCurrentKey();
            // TODO: why do we need to do that? Why on earth we have -1 in subject, predicate or object position???
            if ((currentKey.get(0) > 0) && (currentKey.get(1) > 0) && (currentKey.get(2) > 0)) {
                LongQuadWritable key = new LongQuadWritable(currentKey.get(0), currentKey.get(1),
                        currentKey.get(2), currentKey.get(3));
                log.debug("Sampled {}", key);
                samples.add((K) key);
                ++records;
                if (records >= (i + 1) * samplesPerSplit) {
                    log.debug("Records is {} and (i + 1) * samplesPerSplit is {}", records,
                            (i + 1) * samplesPerSplit);
                    break;
                }
            }
        }
        reader.close();
    }
    return (K[]) samples.toArray();
}

From source file:org.apache.kudu.mapreduce.ITKuduTableInputFormat.java

License:Apache License

@Test
public void test() throws Exception {
    createTable(TABLE_NAME, getBasicSchema(), getBasicCreateTableOptions());

    KuduTable table = openTable(TABLE_NAME);
    Schema schema = getBasicSchema();
    Insert insert = table.newInsert();//from   w w  w.j av a 2 s . com
    PartialRow row = insert.getRow();
    row.addInt(0, 1);
    row.addInt(1, 2);
    row.addInt(2, 3);
    row.addString(3, "a string");
    row.addBoolean(4, true);
    AsyncKuduSession session = client.newSession();
    session.apply(insert).join(DEFAULT_SLEEP);
    session.close().join(DEFAULT_SLEEP);

    // Test getting all the columns back
    RecordReader<NullWritable, RowResult> reader = createRecordReader("*", null);
    assertTrue(reader.nextKeyValue());
    assertEquals(5, reader.getCurrentValue().getColumnProjection().getColumnCount());
    assertFalse(reader.nextKeyValue());

    // Test getting two columns back
    reader = createRecordReader(
            schema.getColumnByIndex(3).getName() + "," + schema.getColumnByIndex(2).getName(), null);
    assertTrue(reader.nextKeyValue());
    assertEquals(2, reader.getCurrentValue().getColumnProjection().getColumnCount());
    assertEquals("a string", reader.getCurrentValue().getString(0));
    assertEquals(3, reader.getCurrentValue().getInt(1));
    try {
        reader.getCurrentValue().getString(2);
        fail("Should only be getting 2 columns back");
    } catch (IndexOutOfBoundsException e) {
        // expected
    }

    // Test getting one column back
    reader = createRecordReader(schema.getColumnByIndex(1).getName(), null);
    assertTrue(reader.nextKeyValue());
    assertEquals(1, reader.getCurrentValue().getColumnProjection().getColumnCount());
    assertEquals(2, reader.getCurrentValue().getInt(0));
    try {
        reader.getCurrentValue().getString(1);
        fail("Should only be getting 1 column back");
    } catch (IndexOutOfBoundsException e) {
        // expected
    }

    // Test getting empty rows back
    reader = createRecordReader("", null);
    assertTrue(reader.nextKeyValue());
    assertEquals(0, reader.getCurrentValue().getColumnProjection().getColumnCount());
    assertFalse(reader.nextKeyValue());

    // Test getting an unknown table, will not work
    try {
        createRecordReader("unknown", null);
        fail("Should not be able to scan a column that doesn't exist");
    } catch (IllegalArgumentException e) {
        // expected
    }

    // Test using a predicate that filters the row out.
    KuduPredicate pred1 = KuduPredicate.newComparisonPredicate(schema.getColumnByIndex(1),
            KuduPredicate.ComparisonOp.GREATER_EQUAL, 3);
    reader = createRecordReader("*", Lists.newArrayList(pred1));
    assertFalse(reader.nextKeyValue());
}

From source file:org.apache.mahout.classifier.df.mapreduce.partial.PartialSequentialBuilder.java

License:Apache License

@Override
protected boolean runJob(Job job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);

    int nbSplits = splits.size();
    log.debug("Nb splits : {}", nbSplits);

    InputSplit[] sorted = new InputSplit[nbSplits];
    splits.toArray(sorted);//  www  .  j a  va 2 s . c o  m
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(conf); // total number of trees

    TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID());

    firstOutput = new MockContext(new Step1Mapper(), conf, task.getTaskAttemptID(), numTrees);

    /* first instance id in hadoop's order */
    //int[] firstIds = new int[nbSplits];
    /* partitions' sizes in hadoop order */
    int[] sizes = new int[nbSplits];

    // to compute firstIds, process the splits in file order
    long slowest = 0; // duration of slowest map
    int firstId = 0;
    for (InputSplit split : splits) {
        int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition

        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task);
        reader.initialize(split, task);

        Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(), hp, nbSplits, numTrees);

        long time = System.currentTimeMillis();

        //firstIds[hp] = firstId;

        while (reader.nextKeyValue()) {
            mapper.map(reader.getCurrentKey(), reader.getCurrentValue(), firstOutput);
            firstId++;
            sizes[hp]++;
        }

        mapper.cleanup(firstOutput);

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
    return true;
}

From source file:org.apache.mahout.df.mapreduce.partial.PartialSequentialBuilder.java

License:Apache License

@Override
protected boolean runJob(Job job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);

    int nbSplits = splits.size();
    log.debug("Nb splits : {}", nbSplits);

    InputSplit[] sorted = new InputSplit[nbSplits];
    splits.toArray(sorted);/*from   w ww  . j  a va2  s  . c  o m*/
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(conf); // total number of trees

    TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID());

    firstOutput = new MockContext(new Step1Mapper(), conf, task.getTaskAttemptID(), numTrees);

    firstIds = new int[nbSplits];
    sizes = new int[nbSplits];

    // to compute firstIds, process the splits in file order
    long slowest = 0; // duration of slowest map
    int firstId = 0;
    for (int p = 0; p < nbSplits; p++) {
        InputSplit split = splits.get(p);
        int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition

        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task);
        reader.initialize(split, task);

        Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(), hp, nbSplits, numTrees);

        long time = System.currentTimeMillis();

        firstIds[hp] = firstId;

        while (reader.nextKeyValue()) {
            mapper.map(reader.getCurrentKey(), reader.getCurrentValue(), firstOutput);
            firstId++;
            sizes[hp]++;
        }

        mapper.cleanup(firstOutput);

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
    return true;
}