List of usage examples for org.apache.hadoop.mapreduce RecordReader getCurrentKey
public abstract KEYIN getCurrentKey() throws IOException, InterruptedException;
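getCurrentKey() returns the key produced by the most recent successful call to nextKeyValue(), or null if there is no current key. The common pattern in the examples below is to create a RecordReader for a split, initialize it, and then loop over nextKeyValue(), reading each key with getCurrentKey() and each value with getCurrentValue(). A minimal sketch of that loop, assuming the input format, split, and task attempt context have already been set up elsewhere:

RecordReader<Text, Text> reader = inputFormat.createRecordReader(split, context);
reader.initialize(split, context);
try {
    while (reader.nextKeyValue()) {
        Text key = reader.getCurrentKey();       // valid until the next call to nextKeyValue()
        Text value = reader.getCurrentValue();
        // process key/value here
    }
} finally {
    reader.close();
}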
From source file:co.cask.cdap.template.etl.common.ETLDBInputFormat.java
License:Apache License
@Override
protected RecordReader createDBRecordReader(DBInputSplit split, Configuration conf) throws IOException {
    final RecordReader dbRecordReader = super.createDBRecordReader(split, conf);
    return new RecordReader() {
        @Override
        public void initialize(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            dbRecordReader.initialize(split, context);
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            return dbRecordReader.nextKeyValue();
        }

        @Override
        public Object getCurrentKey() throws IOException, InterruptedException {
            return dbRecordReader.getCurrentKey();
        }

        @Override
        public Object getCurrentValue() throws IOException, InterruptedException {
            return dbRecordReader.getCurrentValue();
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            return dbRecordReader.getProgress();
        }

        @Override
        public void close() throws IOException {
            dbRecordReader.close();
            try {
                DriverManager.deregisterDriver(driverShim);
            } catch (SQLException e) {
                throw new IOException(e);
            }
        }
    };
}
From source file:com.alexholmes.hadooputils.combine.seqfile.mapreduce.CombineSequenceFileTest.java
License:Apache License
@Test
public void testOneFile() throws IOException, InterruptedException {
    Path dir = new Path(tempFolder.getRoot().getAbsolutePath());

    CombineSequenceFileInputFormat<Text, Text> inputFormat = new CombineSequenceFileInputFormat<Text, Text>();
    Path inputFile = new Path(dir, "file1.txt");

    writeSequenceFile(inputFile);

    Job job = new Job(new JobConf());

    FileInputFormat.addInputPath(job, inputFile);

    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size());

    TaskAttemptID taskId = new TaskAttemptID("jt", 0, true, 0, 0);
    Configuration conf1 = new Configuration();
    TaskAttemptContext context1 = new TaskAttemptContext(conf1, taskId);
    RecordReader<Text, Text> rr = inputFormat.createRecordReader(splits.get(0), context1);
    rr.initialize(splits.get(0), context1);

    assertTrue(rr.nextKeyValue());

    assertEquals(key, rr.getCurrentKey());
    assertEquals(value, rr.getCurrentValue());

    assertFalse(rr.nextKeyValue());
    assertEquals(1.0f, rr.getProgress(), 0.1);
}
From source file:com.alexholmes.hadooputils.combine.seqfile.mapreduce.CombineSequenceFileTest.java
License:Apache License
@Test
public void testTwoFiles() throws IOException, InterruptedException {
    Path dir = new Path(tempFolder.getRoot().getAbsolutePath());

    CombineSequenceFileInputFormat<Text, Text> inputFormat = new CombineSequenceFileInputFormat<Text, Text>();
    Path inputFile1 = new Path(dir, "file1.txt");
    Path inputFile2 = new Path(dir, "file2.txt");

    writeSequenceFile(inputFile1);
    writeSequenceFile(inputFile2);

    Job job = new Job(new JobConf());

    FileInputFormat.addInputPath(job, inputFile1);
    FileInputFormat.addInputPath(job, inputFile2);

    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size());

    TaskAttemptID taskId = new TaskAttemptID("jt", 0, true, 0, 0);
    Configuration conf1 = new Configuration();
    TaskAttemptContext context1 = new TaskAttemptContext(conf1, taskId);
    RecordReader<Text, Text> rr = inputFormat.createRecordReader(splits.get(0), context1);
    rr.initialize(splits.get(0), context1);

    assertTrue(rr.nextKeyValue());

    assertEquals(key, rr.getCurrentKey());
    assertEquals(value, rr.getCurrentValue());

    assertEquals(0.5f, rr.getProgress(), 0.1);

    assertTrue(rr.nextKeyValue());

    assertEquals(key, rr.getCurrentKey());
    assertEquals(value, rr.getCurrentValue());

    assertFalse(rr.nextKeyValue());
    assertEquals(1.0f, rr.getProgress(), 0.1);
}
From source file:com.cloudera.recordservice.examples.terasort.TeraInputFormat.java
License:Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws Throwable if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            @Override
            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {
                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing partitions took " + (t3 - t2) + "ms");
}
From source file:com.datasalt.pangool.tuplemr.mapred.lib.output.TestTupleInputOutputFormat.java
License:Apache License
public void testSplits(long maxSplitSize, int generatedRows) throws IOException, InterruptedException,
        IllegalArgumentException, SecurityException, ClassNotFoundException, InstantiationException,
        IllegalAccessException, InvocationTargetException, NoSuchMethodException {
    logger.info("Testing maxSplitSize: " + maxSplitSize + " and generatedRows:" + generatedRows);
    FileSystem fS = FileSystem.get(getConf());
    Random r = new Random(1);
    Schema schema = new Schema("schema", Fields.parse("i:int,s:string"));
    ITuple tuple = new Tuple(schema);

    Path outPath = new Path(OUT);
    TupleFile.Writer writer = new TupleFile.Writer(FileSystem.get(getConf()), getConf(), outPath, schema);
    for (int i = 0; i < generatedRows; i++) {
        tuple.set("i", r.nextInt());
        tuple.set("s", r.nextLong() + "");
        writer.append(tuple);
    }
    writer.close();

    TupleInputFormat format = ReflectionUtils.newInstance(TupleInputFormat.class, getConf());
    Job job = new Job(getConf());
    FileInputFormat.setInputPaths(job, outPath);
    logger.info("Using max input split size: " + maxSplitSize);
    FileInputFormat.setMaxInputSplitSize(job, maxSplitSize);
    job.setInputFormatClass(FileInputFormat.class);

    // Read all the splits and count. The number of read rows must
    // be the same as the written ones.
    int count = 0;
    for (InputSplit split : format.getSplits(job)) {
        TaskAttemptID attemptId = new TaskAttemptID(new TaskID(), 1);
        TaskAttemptContext attemptContext = TaskAttemptContextFactory.get(getConf(), attemptId);
        logger.info("Sampling split: " + split);
        RecordReader<ITuple, NullWritable> reader = format.createRecordReader(split, attemptContext);
        reader.initialize(split, attemptContext);
        while (reader.nextKeyValue()) {
            tuple = reader.getCurrentKey();
            count++;
        }
        reader.close();
    }

    assertEquals(generatedRows, count);

    HadoopUtils.deleteIfExists(fS, outPath);
}
From source file:com.ikanow.aleph2.search_service.elasticsearch.hadoop.assets.TestAleph2EsInputFormat.java
License:Apache License
@Test
public void test_Aleph2EsRecordReader_objectConversion() throws IOException, InterruptedException {
    @SuppressWarnings("rawtypes")
    final RecordReader mock_shard_record_reader = Mockito.mock(RecordReader.class);

    // mock returns Text key, MapWritable value
    Mockito.when(mock_shard_record_reader.getCurrentKey()).thenReturn(new Text("text_test"));
    final MapWritable test_out = new MapWritable();
    test_out.put(new Text("val_key_text"), new Text("val_val_text"));
    Mockito.when(mock_shard_record_reader.getCurrentValue()).thenReturn(test_out);

    final Aleph2EsRecordReader reader_under_test = new Aleph2EsRecordReader(mock_shard_record_reader);

    final String key = reader_under_test.getCurrentKey();
    assertEquals(String.class, key.getClass());
    assertEquals("text_test", key);

    final Tuple2<Long, IBatchRecord> value = reader_under_test.getCurrentValue();
    assertEquals(0L, value._1().longValue()); // (so something breaks in here when/if we put some logic in)
    assertEquals(Optional.empty(), value._2().getContent());
    final JsonNode json_val = value._2().getJson();
    assertTrue("Is object: " + json_val, json_val.isObject());
    assertEquals("val_val_text", json_val.get("val_key_text").asText());
    assertEquals("text_test", json_val.get("_id").asText());
}
From source file:com.ikanow.aleph2.v1.document_db.hadoop.assets.TestAleph2V1InputFormat.java
License:Apache License
@Test
public void test_V1DocumentDbRecordReader_objectConversion() throws IOException, InterruptedException {
    @SuppressWarnings("unchecked")
    final RecordReader<Object, BSONObject> mock_record_reader = (RecordReader<Object, BSONObject>) Mockito
            .mock(RecordReader.class);
    Mockito.when(mock_record_reader.getCurrentKey()).thenReturn("text_test");
    final BasicDBObject test_ret = new BasicDBObject();
    test_ret.put("val_key_text", "val_val_text");
    Mockito.when(mock_record_reader.getCurrentValue()).thenReturn(test_ret);

    try (final V1DocumentDbRecordReader reader_under_test = new V1DocumentDbRecordReader(mock_record_reader)) {
        final String key = reader_under_test.getCurrentKey();
        assertEquals(String.class, key.getClass());
        assertEquals("text_test", key);

        final Tuple2<Long, IBatchRecord> value = reader_under_test.getCurrentValue();
        assertEquals(0L, value._1().longValue()); // (so something breaks in here when/if we put some logic in)
        assertEquals(Optional.empty(), value._2().getContent());
        final JsonNode json_val = value._2().getJson();
        assertTrue("Is object: " + json_val, json_val.isObject());
        assertEquals("val_val_text", json_val.get("val_key_text").asText());
    }
}
From source file:com.phantom.hadoop.examples.terasort.TeraInputFormat.java
License:Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 *
 * @param job
 *            the job to sample
 * @param partFile
 *            where to write the output file to
 * @throws Throwable
 *             if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {
                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing partitions took " + (t3 - t2) + "ms");
}
From source file:com.splicemachine.derby.impl.io.WholeTextInputFormatTest.java
License:Apache License
private long collectRecords(Set<String> fileNames, RecordReader<String, InputStream> recordReader)
        throws IOException, InterruptedException {
    long count = 0L;
    while (recordReader.nextKeyValue()) {
        String key = recordReader.getCurrentKey();
        key = key.replaceAll("/+", "/"); // some platforms add more "/" at the beginning, coalesce them for equality check
        Assert.assertTrue("Seen the same file twice!", fileNames.add(key));

        InputStream is = recordReader.getCurrentValue();
        try (BufferedReader br = new BufferedReader(new InputStreamReader(is))) {
            String n;
            while ((n = br.readLine()) != null) {
                count++;
            }
        }
    }
    return count;
}
From source file:com.splout.db.hadoop.SchemaSampler.java
License:Apache License
public static Schema sample(Configuration conf, Path input, InputFormat<ITuple, NullWritable> inputFormat)
        throws IOException, InterruptedException {
    Schema schema = null;

    // sample schema from input path given the provided InputFormat
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    FileInputFormat.setInputPaths(job, input);
    // get first inputSplit
    List<InputSplit> inputSplits = inputFormat.getSplits(job);
    if (inputSplits == null || inputSplits.size() == 0) {
        throw new IOException(
                "Given input format doesn't produce any input split. Can't sample first record. PATH: " + input);
    }
    InputSplit inputSplit = inputSplits.get(0);
    TaskAttemptID attemptId = new TaskAttemptID(new TaskID(), 1);
    TaskAttemptContext attemptContext;
    try {
        attemptContext = TaskAttemptContextFactory.get(conf, attemptId);
    } catch (Exception e) {
        throw new IOException(e);
    }

    RecordReader<ITuple, NullWritable> rReader = inputFormat.createRecordReader(inputSplit, attemptContext);
    rReader.initialize(inputSplit, attemptContext);

    if (!rReader.nextKeyValue()) {
        throw new IOException(
                "Can't read first record of first input split of the given path [" + input + "].");
    }

    // finally get the sample schema
    schema = rReader.getCurrentKey().getSchema();
    log.info("Sampled schema from [" + input + "] : " + schema);
    rReader.close();

    return schema;
}