List of usage examples for org.apache.hadoop.mapreduce RecordReader getCurrentKey
public abstract KEYIN getCurrentKey() throws IOException, InterruptedException;
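getCurrentKey() returns the key produced by the most recent successful call to nextKeyValue(), or null if there is no current key. The common pattern in the examples below is to create a RecordReader for a split, initialize it, and then loop over nextKeyValue(), reading each key with getCurrentKey() and each value with getCurrentValue(). A minimal sketch of that loop, assuming the input format, split, and task attempt context have already been set up elsewhere:

RecordReader<Text, Text> reader = inputFormat.createRecordReader(split, context);
reader.initialize(split, context);
try {
    while (reader.nextKeyValue()) {
        Text key = reader.getCurrentKey();       // valid until the next call to nextKeyValue()
        Text value = reader.getCurrentValue();
        // process key/value here
    }
} finally {
    reader.close();
}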
From source file:co.cask.cdap.template.etl.common.ETLDBInputFormat.java
License:Apache License
@Override
protected RecordReader createDBRecordReader(DBInputSplit split, Configuration conf) throws IOException {
    final RecordReader dbRecordReader = super.createDBRecordReader(split, conf);
    return new RecordReader() {
        @Override
        public void initialize(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            dbRecordReader.initialize(split, context);
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            return dbRecordReader.nextKeyValue();
        }

        @Override
        public Object getCurrentKey() throws IOException, InterruptedException {
            return dbRecordReader.getCurrentKey();
        }

        @Override
        public Object getCurrentValue() throws IOException, InterruptedException {
            return dbRecordReader.getCurrentValue();
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            return dbRecordReader.getProgress();
        }

        @Override
        public void close() throws IOException {
            dbRecordReader.close();
            try {
                DriverManager.deregisterDriver(driverShim);
            } catch (SQLException e) {
                throw new IOException(e);
            }
        }
    };
}
From source file:com.alexholmes.hadooputils.combine.seqfile.mapreduce.CombineSequenceFileTest.java
License:Apache License
@Test
public void testOneFile() throws IOException, InterruptedException {
    Path dir = new Path(tempFolder.getRoot().getAbsolutePath());

    CombineSequenceFileInputFormat<Text, Text> inputFormat = new CombineSequenceFileInputFormat<Text, Text>();
    Path inputFile = new Path(dir, "file1.txt");

    writeSequenceFile(inputFile);

    Job job = new Job(new JobConf());

    FileInputFormat.addInputPath(job, inputFile);

    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size());

    TaskAttemptID taskId = new TaskAttemptID("jt", 0, true, 0, 0);
    Configuration conf1 = new Configuration();
    TaskAttemptContext context1 = new TaskAttemptContext(conf1, taskId);
    RecordReader<Text, Text> rr = inputFormat.createRecordReader(splits.get(0), context1);
    rr.initialize(splits.get(0), context1);

    assertTrue(rr.nextKeyValue());

    assertEquals(key, rr.getCurrentKey());
    assertEquals(value, rr.getCurrentValue());

    assertFalse(rr.nextKeyValue());
    assertEquals(1.0f, rr.getProgress(), 0.1);
}
From source file:com.alexholmes.hadooputils.combine.seqfile.mapreduce.CombineSequenceFileTest.java
License:Apache License
@Test
public void testTwoFiles() throws IOException, InterruptedException {
    Path dir = new Path(tempFolder.getRoot().getAbsolutePath());

    CombineSequenceFileInputFormat<Text, Text> inputFormat = new CombineSequenceFileInputFormat<Text, Text>();
    Path inputFile1 = new Path(dir, "file1.txt");
    Path inputFile2 = new Path(dir, "file2.txt");

    writeSequenceFile(inputFile1);
    writeSequenceFile(inputFile2);

    Job job = new Job(new JobConf());

    FileInputFormat.addInputPath(job, inputFile1);
    FileInputFormat.addInputPath(job, inputFile2);

    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size());

    TaskAttemptID taskId = new TaskAttemptID("jt", 0, true, 0, 0);
    Configuration conf1 = new Configuration();
    TaskAttemptContext context1 = new TaskAttemptContext(conf1, taskId);
    RecordReader<Text, Text> rr = inputFormat.createRecordReader(splits.get(0), context1);
    rr.initialize(splits.get(0), context1);

    assertTrue(rr.nextKeyValue());

    assertEquals(key, rr.getCurrentKey());
    assertEquals(value, rr.getCurrentValue());

    assertEquals(0.5f, rr.getProgress(), 0.1);

    assertTrue(rr.nextKeyValue());

    assertEquals(key, rr.getCurrentKey());
    assertEquals(value, rr.getCurrentValue());

    assertFalse(rr.nextKeyValue());
    assertEquals(1.0f, rr.getProgress(), 0.1);
}
From source file:com.cloudera.recordservice.examples.terasort.TeraInputFormat.java
License:Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws Throwable if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            @Override
            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {
                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing partitions took " + (t3 - t2) + "ms");
}
From source file:com.datasalt.pangool.tuplemr.mapred.lib.output.TestTupleInputOutputFormat.java
License:Apache License
public void testSplits(long maxSplitSize, int generatedRows) throws IOException, InterruptedException,
        IllegalArgumentException, SecurityException, ClassNotFoundException, InstantiationException,
        IllegalAccessException, InvocationTargetException, NoSuchMethodException {
    logger.info("Testing maxSplitSize: " + maxSplitSize + " and generatedRows:" + generatedRows);
    FileSystem fS = FileSystem.get(getConf());
    Random r = new Random(1);
    Schema schema = new Schema("schema", Fields.parse("i:int,s:string"));
    ITuple tuple = new Tuple(schema);

    Path outPath = new Path(OUT);
    TupleFile.Writer writer = new TupleFile.Writer(FileSystem.get(getConf()), getConf(), outPath, schema);
    for (int i = 0; i < generatedRows; i++) {
        tuple.set("i", r.nextInt());
        tuple.set("s", r.nextLong() + "");
        writer.append(tuple);
    }
    writer.close();

    TupleInputFormat format = ReflectionUtils.newInstance(TupleInputFormat.class, getConf());
    Job job = new Job(getConf());
    FileInputFormat.setInputPaths(job, outPath);
    logger.info("Using max input split size: " + maxSplitSize);
    FileInputFormat.setMaxInputSplitSize(job, maxSplitSize);
    job.setInputFormatClass(FileInputFormat.class);

    // Read all the splits and count. The number of read rows must
    // be the same as the written ones.
    int count = 0;
    for (InputSplit split : format.getSplits(job)) {
        TaskAttemptID attemptId = new TaskAttemptID(new TaskID(), 1);
        TaskAttemptContext attemptContext = TaskAttemptContextFactory.get(getConf(), attemptId);
        logger.info("Sampling split: " + split);
        RecordReader<ITuple, NullWritable> reader = format.createRecordReader(split, attemptContext);
        reader.initialize(split, attemptContext);
        while (reader.nextKeyValue()) {
            tuple = reader.getCurrentKey();
            count++;
        }
        reader.close();
    }

    assertEquals(generatedRows, count);

    HadoopUtils.deleteIfExists(fS, outPath);
}
From source file:com.ikanow.aleph2.search_service.elasticsearch.hadoop.assets.TestAleph2EsInputFormat.java
License:Apache License
@Test
public void test_Aleph2EsRecordReader_objectConversion() throws IOException, InterruptedException {
    @SuppressWarnings("rawtypes")
    final RecordReader mock_shard_record_reader = Mockito.mock(RecordReader.class);

    // mock returns Text key, MapWritable value
    Mockito.when(mock_shard_record_reader.getCurrentKey()).thenReturn(new Text("text_test"));
    final MapWritable test_out = new MapWritable();
    test_out.put(new Text("val_key_text"), new Text("val_val_text"));
    Mockito.when(mock_shard_record_reader.getCurrentValue()).thenReturn(test_out);

    final Aleph2EsRecordReader reader_under_test = new Aleph2EsRecordReader(mock_shard_record_reader);

    final String key = reader_under_test.getCurrentKey();
    assertEquals(String.class, key.getClass());
    assertEquals("text_test", key);

    final Tuple2<Long, IBatchRecord> value = reader_under_test.getCurrentValue();
    assertEquals(0L, value._1().longValue()); // (so something breaks in here when/if we put some logic in)
    assertEquals(Optional.empty(), value._2().getContent());
    final JsonNode json_val = value._2().getJson();
    assertTrue("Is object: " + json_val, json_val.isObject());
    assertEquals("val_val_text", json_val.get("val_key_text").asText());
    assertEquals("text_test", json_val.get("_id").asText());
}
From source file:com.ikanow.aleph2.v1.document_db.hadoop.assets.TestAleph2V1InputFormat.java
License:Apache License
@Test
public void test_V1DocumentDbRecordReader_objectConversion() throws IOException, InterruptedException {
    @SuppressWarnings("unchecked")
    final RecordReader<Object, BSONObject> mock_record_reader = (RecordReader<Object, BSONObject>) Mockito
            .mock(RecordReader.class);
    Mockito.when(mock_record_reader.getCurrentKey()).thenReturn("text_test");
    final BasicDBObject test_ret = new BasicDBObject();
    test_ret.put("val_key_text", "val_val_text");
    Mockito.when(mock_record_reader.getCurrentValue()).thenReturn(test_ret);

    try (final V1DocumentDbRecordReader reader_under_test = new V1DocumentDbRecordReader(mock_record_reader)) {
        final String key = reader_under_test.getCurrentKey();
        assertEquals(String.class, key.getClass());
        assertEquals("text_test", key);

        final Tuple2<Long, IBatchRecord> value = reader_under_test.getCurrentValue();
        assertEquals(0L, value._1().longValue()); // (so something breaks in here when/if we put some logic in)
        assertEquals(Optional.empty(), value._2().getContent());
        final JsonNode json_val = value._2().getJson();
        assertTrue("Is object: " + json_val, json_val.isObject());
        assertEquals("val_val_text", json_val.get("val_key_text").asText());
    }
}
From source file:com.phantom.hadoop.examples.terasort.TeraInputFormat.java
License:Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 *
 * @param job
 *            the job to sample
 * @param partFile
 *            where to write the output file to
 * @throws Throwable
 *             if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {
                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing partitions took " + (t3 - t2) + "ms");
}
From source file:com.splicemachine.derby.impl.io.WholeTextInputFormatTest.java
License:Apache License
private long collectRecords(Set<String> fileNames, RecordReader<String, InputStream> recordReader)
        throws IOException, InterruptedException {
    long count = 0L;
    while (recordReader.nextKeyValue()) {
        String key = recordReader.getCurrentKey();
        key = key.replaceAll("/+", "/"); // some platforms add more "/" at the beginning, coalesce them for equality check
        Assert.assertTrue("Seen the same file twice!", fileNames.add(key));

        InputStream is = recordReader.getCurrentValue();
        try (BufferedReader br = new BufferedReader(new InputStreamReader(is))) {
            String n;
            while ((n = br.readLine()) != null) {
                count++;
            }
        }
    }
    return count;
}
From source file:com.splout.db.hadoop.SchemaSampler.java
License:Apache License
public static Schema sample(Configuration conf, Path input, InputFormat<ITuple, NullWritable> inputFormat)
        throws IOException, InterruptedException {
    Schema schema = null;

    // sample schema from input path given the provided InputFormat
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    FileInputFormat.setInputPaths(job, input);
    // get first inputSplit
    List<InputSplit> inputSplits = inputFormat.getSplits(job);
    if (inputSplits == null || inputSplits.size() == 0) {
        throw new IOException(
                "Given input format doesn't produce any input split. Can't sample first record. PATH: " + input);
    }
    InputSplit inputSplit = inputSplits.get(0);
    TaskAttemptID attemptId = new TaskAttemptID(new TaskID(), 1);
    TaskAttemptContext attemptContext;
    try {
        attemptContext = TaskAttemptContextFactory.get(conf, attemptId);
    } catch (Exception e) {
        throw new IOException(e);
    }

    RecordReader<ITuple, NullWritable> rReader = inputFormat.createRecordReader(inputSplit, attemptContext);
    rReader.initialize(inputSplit, attemptContext);

    if (!rReader.nextKeyValue()) {
        throw new IOException(
                "Can't read first record of first input split of the given path [" + input + "].");
    }

    // finally get the sample schema
    schema = rReader.getCurrentKey().getSchema();
    log.info("Sampled schema from [" + input + "] : " + schema);
    rReader.close();

    return schema;
}