List of usage examples for org.apache.hadoop.io.NullWritable.get()
public static NullWritable get()
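NullWritable is Hadoop's zero-length Writable: it serializes to nothing, and get() returns the shared singleton, so it is the idiomatic placeholder wherever a MapReduce key or value carries no information. Before the real-world examples below, here is a minimal self-contained sketch (not drawn from any of the source files; the output path and class name are illustrative) that writes a keys-only SequenceFile using NullWritable.get() as every value:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;

public class NullWritableSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path("/tmp/keys-only.seq"); // illustrative path

    // NullWritable.class as the value class marks the file as keys-only;
    // NullWritable.get() supplies the singleton instance, so no per-record
    // value object is ever allocated.
    try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
        SequenceFile.Writer.file(path),
        SequenceFile.Writer.keyClass(IntWritable.class),
        SequenceFile.Writer.valueClass(NullWritable.class))) {
      for (int i = 0; i < 3; i++) {
        writer.append(new IntWritable(i), NullWritable.get());
      }
    }
  }
}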
From source file:com.hotels.corc.mapred.CorcInputFormatTest.java
License:Apache License
@Test
public void readFullyReadSchemaFromSplit() throws IOException {
  StructTypeInfo typeInfo = new StructTypeInfoBuilder()
      .add("a", TypeInfoFactory.stringTypeInfo)
      .add("b", TypeInfoFactory.stringTypeInfo)
      .build();
  CorcInputFormat.setTypeInfo(conf, typeInfo);
  CorcInputFormat.setConverterFactoryClass(conf, DefaultConverterFactory.class);

  RecordReader<NullWritable, Corc> reader = inputFormat.getRecordReader(split, conf, reporter);
  Corc corc = reader.createValue();
  reader.next(NullWritable.get(), corc);

  assertThat(corc.get("a"), is((Object) "A1"));
  assertThat(corc.get("b"), is((Object) "B1"));
  reader.close();
}
From source file:com.hotels.corc.mapred.CorcInputFormatTest.java
License:Apache License
@Test
public void readFullyDeclaredSchema() throws IOException {
  StructTypeInfo typeInfo = new StructTypeInfoBuilder()
      .add("a", TypeInfoFactory.stringTypeInfo)
      .add("b", TypeInfoFactory.stringTypeInfo)
      .build();
  CorcInputFormat.setTypeInfo(conf, typeInfo);
  CorcInputFormat.setSchemaTypeInfo(conf, typeInfo);
  CorcInputFormat.setConverterFactoryClass(conf, DefaultConverterFactory.class);

  RecordReader<NullWritable, Corc> reader = inputFormat.getRecordReader(split, conf, reporter);
  Corc corc = reader.createValue();
  reader.next(NullWritable.get(), corc);

  assertThat(corc.get("a"), is((Object) "A1"));
  assertThat(corc.get("b"), is((Object) "B1"));
  reader.close();
}
From source file:com.hotels.corc.mapred.CorcOutputFormatTest.java
License:Apache License
@Test
public void writer() throws IOException {
  File root = temporaryFolder.getRoot();
  conf.set("mapreduce.output.fileoutputformat.outputdir", root.getCanonicalPath());
  conf.set("mapreduce.task.attempt.id", "attempt_x_0001_m_000001_1");
  String name = "name";
  RecordWriter<NullWritable, Corc> writer = outputFormat.getRecordWriter(fileSystem, conf, name, progress);

  StructTypeInfo typeInfo = new StructTypeInfoBuilder().add("a", TypeInfoFactory.stringTypeInfo).build();
  Corc corc = new Corc(typeInfo, new DefaultConverterFactory());
  corc.set("a", "value");
  writer.write(NullWritable.get(), corc);
  writer.close(reporter);

  Path path = new Path(root.getCanonicalPath() + "/_temporary/0/_temporary/attempt_x_0001_m_000001_1/name");
  try (OrcReader reader = new OrcReader(conf, path)) {
    List<Object> next = reader.next();
    assertThat(next.size(), is(1));
    assertThat(next.get(0), is((Object) "value"));
    assertFalse(reader.hasNext());
  }
}
From source file:com.hotels.corc.mapred.CorcRecordReaderTest.java
License:Apache License
@Test
public void readerNext() throws IOException {
  @SuppressWarnings("unchecked")
  RecordReader<NullWritable, OrcStruct> recordReader = mock(RecordReader.class);
  CorcRecordReader reader = new CorcRecordReader(typeInfo, recordReader, factory);
  Corc corc = mock(Corc.class);
  when(recordReader.next(any(NullWritable.class), any(OrcStruct.class))).thenReturn(true);

  boolean next = reader.next(NullWritable.get(), corc);

  assertTrue(next);
  verify(corc, never()).setRecordIdentifier(any(RecordIdentifier.class));
}
From source file:com.hotels.corc.mapred.CorcRecordReaderTest.java
License:Apache License
@Test
public void readerNoNext() throws IOException {
  @SuppressWarnings("unchecked")
  RecordReader<NullWritable, OrcStruct> recordReader = mock(RecordReader.class);
  CorcRecordReader reader = new CorcRecordReader(typeInfo, recordReader, factory);
  Corc corc = mock(Corc.class);
  when(recordReader.next(any(NullWritable.class), any(OrcStruct.class))).thenReturn(false);

  boolean next = reader.next(NullWritable.get(), corc);

  assertFalse(next);
  verify(corc, never()).setRecordIdentifier(any(RecordIdentifier.class));
}
From source file:com.hotels.corc.mapred.CorcRecordReaderTest.java
License:Apache License
@Test
public void readerNextTransactional() throws IOException {
  @SuppressWarnings("unchecked")
  AcidRecordReader<NullWritable, OrcStruct> recordReader = mock(AcidRecordReader.class);
  CorcRecordReader reader = new CorcRecordReader(typeInfo, recordReader, factory);
  Corc corc = mock(Corc.class);
  when(recordReader.next(any(NullWritable.class), any(OrcStruct.class))).thenReturn(true);

  boolean next = reader.next(NullWritable.get(), corc);

  assertTrue(next);
  verify(corc).setRecordIdentifier(any(RecordIdentifier.class));
}
From source file:com.hotels.corc.mapred.CorcRecordReaderTest.java
License:Apache License
@Test
public void readerNoNextTransactional() throws IOException {
  @SuppressWarnings("unchecked")
  AcidRecordReader<NullWritable, OrcStruct> recordReader = mock(AcidRecordReader.class);
  CorcRecordReader reader = new CorcRecordReader(typeInfo, recordReader, factory);
  Corc corc = mock(Corc.class);
  when(recordReader.next(any(NullWritable.class), any(OrcStruct.class))).thenReturn(false);

  boolean next = reader.next(NullWritable.get(), corc);

  assertFalse(next);
  verify(corc, never()).setRecordIdentifier(any(RecordIdentifier.class));
}
From source file:com.htuple.SecondarySortTest.java
License:Apache License
@Test
public void testSecondarySort() throws IOException {
  Configuration conf = mapReduceDriver.getConfiguration();
  SecondarySort.setupSecondarySort(conf);

  LongWritable dummyMapKey = new LongWritable(1);
  mapReduceDriver
      .withInput(dummyMapKey, new Text("Smith\tJohn\n"))
      .withInput(dummyMapKey, new Text("Smith\tAnne\n"))
      .withInput(dummyMapKey, new Text("Smith\tKen\n"))
      .withOutput(new Text("Smith\tAnne\n"), NullWritable.get())
      .withOutput(new Text("Smith\tJohn\n"), NullWritable.get())
      .withOutput(new Text("Smith\tKen\n"), NullWritable.get())
      .runTest(true);
}
From source file:com.hyperiongray.ccmr.s3wordcount.WordCountOnlyMapper.java
License:Apache License
public void map(Object key, Text value, OutputCollector<NullWritable, Text> outputCollector, Reporter reporter)
    throws IOException {
  // We're accessing a publicly available bucket, so we don't need to fill in our credentials.
  ArchiveReader ar;
  try {
    S3Service s3s = new RestS3Service(null);
    // Grab a file out of the CommonCrawl S3 bucket.
    String fn = value.toString();
    logger.info(fn);
    S3Object f = s3s.getObject("aws-publicdatasets", fn, null, null, null, null, null, null);
    // The file name identifies the ArchiveReader and indicates whether it should be decompressed.
    ar = WARCReaderFactory.get(fn, f.getDataInputStream(), true);
  } catch (ServiceException e) {
    logger.error("S3 connection failed", e);
    throw new RuntimeException(e);
  }

  // Once we have an ArchiveReader, we can work through each of the records it contains.
  int i = 0;
  logger.info("Started " + new Date());
  for (ArchiveRecord r : ar) {
    reporter.progress();
    String url = "";
    try {
      // The header contains information such as the record type, size, creation time, and URL.
      url = r.getHeader().getUrl();
      String crawledDate = r.getHeader().getDate();
      if (url == null) {
        continue;
      }

      // To read the record's contents, use the ArchiveRecord as an InputStream and dump it
      // into a byte array as long as the record's stated length.
      // Note: a potential optimization would be to allocate one large buffer only once.
      OutputStream os = new ByteArrayOutputStream();
      try {
        r.dump(os);
      } finally {
        try {
          r.close();
        } catch (Exception e) {
          logger.error("Reading input stream failed", e);
        }
      }

      // Convert the record body to a string so the matcher can score it.
      String content = os.toString();
      Map<String, Integer> matches = contentMatcher.matchContent(content);
      int score = contentMatcher.score(matches);
      if (score > LOWER_SCORE_THRESHOLD) {
        logger.info("****************************************");
        logger.info("URL: " + url + " Score: " + score + " Detail: " + matches);
        // outputCollector.collect(new IntWritable(score), new Text(url));
        outputCollector.collect(NullWritable.get(), new Text(
            outputParser.parse(contentMatcher.getTitle(content), url, crawledDate, score, matches)));
      }
      logger.debug(Integer.toString(i));
      if (i++ > sampleSize) {
        logger.info("Finished " + new Date());
        break;
      }
    } catch (Exception e) {
      logger.error("URL failed: " + url, e);
    }
  }
}
From source file:com.ibm.bi.dml.runtime.matrix.sort.SamplingSortMRInputFormat.java
License:Open Source License
/**
 * Use the input splits to take samples of the input and generate sample keys.
 * Reads up to SAMPLE_SIZE keys (1,000 by default) from up to 10 locations in
 * the input, sorts them, and picks N-1 keys to generate N equally sized
 * partitions.
 *
 * @param conf the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 * @throws IllegalAccessException
 * @throws InstantiationException
 */
@SuppressWarnings({ "unchecked", "unused", "deprecation" })
public static int writePartitionFile(JobConf conf, Path partFile)
    throws IOException, InstantiationException, IllegalAccessException {
  SamplingSortMRInputFormat inFormat = new SamplingSortMRInputFormat();
  Sampler sampler = new Sampler();

  Class<? extends WritableComparable> targetKeyClass;
  targetKeyClass = (Class<? extends WritableComparable>) conf.getClass(TARGET_KEY_CLASS,
      WritableComparable.class);

  // Get input converter information.
  int brlen = MRJobConfiguration.getNumRowsPerBlock(conf, (byte) 0);
  int bclen = MRJobConfiguration.getNumColumnsPerBlock(conf, (byte) 0);

  // Indicates whether the matrix value in this mapper is a matrix cell or a matrix block.
  int partitions = conf.getNumReduceTasks();
  long sampleSize = conf.getLong(SAMPLE_SIZE, 1000);

  InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
  int samples = Math.min(10, splits.length);
  long recordsPerSample = sampleSize / samples;
  int sampleStep = splits.length / samples;

  // Take N samples from different parts of the input.
  int totalcount = 0;
  for (int i = 0; i < samples; ++i) {
    SequenceFileRecordReader reader = (SequenceFileRecordReader) inFormat
        .getRecordReader(splits[sampleStep * i], conf, null);
    int count = 0;
    WritableComparable key = (WritableComparable) reader.createKey();
    Writable value = (Writable) reader.createValue();
    while (reader.next(key, value) && count < recordsPerSample) {
      Converter inputConverter = MRJobConfiguration.getInputConverter(conf, (byte) 0);
      inputConverter.setBlockSize(brlen, bclen);
      inputConverter.convert(key, value);
      while (inputConverter.hasNext()) {
        Pair pair = inputConverter.next();
        if (pair.getKey() instanceof DoubleWritable) {
          sampler.addValue(new DoubleWritable(((DoubleWritable) pair.getKey()).get()));
        } else if (pair.getValue() instanceof MatrixCell) {
          sampler.addValue(new DoubleWritable(((MatrixCell) pair.getValue()).getValue()));
        } else {
          throw new IOException("SamplingSortMRInputFormat unsupported key/value class: "
              + pair.getKey().getClass() + ":" + pair.getValue().getClass());
        }
        count++;
      }
      key = (WritableComparable) reader.createKey();
      value = (Writable) reader.createValue();
    }
    totalcount += count;
  }

  if (totalcount == 0) { // empty input files
    sampler.addValue(new DoubleWritable(0));
  }

  FileSystem outFs = partFile.getFileSystem(conf);
  if (outFs.exists(partFile)) {
    outFs.delete(partFile, false);
  }

  // Note: key/value are always double/null, as expected by the partitioner.
  SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile, DoubleWritable.class,
      NullWritable.class);
  NullWritable nullValue = NullWritable.get();
  int index0 = -1, i = 0;
  boolean lessthan0 = true;
  for (WritableComparable splitValue : sampler.createPartitions(partitions)) {
    writer.append(splitValue, nullValue);
    if (lessthan0 && ((DoubleWritable) splitValue).get() >= 0) {
      index0 = i;
      lessthan0 = false;
    }
    i++;
  }
  if (lessthan0) {
    index0 = partitions - 1;
  }
  writer.close();

  return index0;
}