List of usage examples for org.apache.hadoop.io NullWritable get
public static NullWritable get()
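NullWritable.get() returns the single, shared NullWritable instance; it is the usual placeholder wherever a MapReduce key or value carries no information, as every example below demonstrates. A minimal standalone sketch, assuming a SequenceFile whose keys carry the data (the class name and output path are illustrative, not taken from the examples):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class NullWritableExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path file = new Path("example.seq"); // hypothetical output path
        // Keys carry the data; NullWritable.get() stands in for the unused value.
        try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(file),
                SequenceFile.Writer.keyClass(Text.class),
                SequenceFile.Writer.valueClass(NullWritable.class))) {
            writer.append(new Text("record-1"), NullWritable.get());
        }
    }
}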
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.WARCWriterReducerClass.java
License:Apache License
@Override
protected void reduce(Text key, Iterable<WARCWritable> values, Context context)
        throws IOException, InterruptedException {
    for (WARCWritable warcWritable : values) {
        context.write(NullWritable.get(), warcWritable);
    }
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.WARCWriterReducerClass.java
License:Apache License
/**
 * Writes a single WARCWritable to the output with a specific output file prefix.
 *
 * @param warcWritable    warc record
 * @param multipleOutputs output
 * @throws IOException          exception
 * @throws InterruptedException exception
 */
// TODO move somewhere else?
public static void writeSingleWARCWritableToOutput(WARCWritable warcWritable,
        MultipleOutputs<NullWritable, WARCWritable> multipleOutputs)
        throws IOException, InterruptedException {
    WARCRecord.Header header = warcWritable.getRecord().getHeader();
    String license = header.getField(WARCRecord.WARCRecordFieldConstants.LICENSE);
    String language = header.getField(WARCRecord.WARCRecordFieldConstants.LANGUAGE);
    String noBoilerplate = header.getField(WARCRecord.WARCRecordFieldConstants.NO_BOILERPLATE);
    String minimalHtml = header.getField(WARCRecord.WARCRecordFieldConstants.MINIMAL_HTML);

    // set the file name prefix
    String fileName = createOutputFilePrefix(license, language, noBoilerplate, minimalHtml);

    // bottleneck of single reducer for all "Lic_none_Lang_en" pages (majority of Web)
    // if ("en".equals(language) && LicenseDetector.NO_LICENCE.equals(license)) {
    //     long simHash = Long
    //             .valueOf(header.getField(WARCRecord.WARCRecordFieldConstants.SIMHASH));
    //     int binNumber = getBinNumberFromSimHash(simHash);
    //     fileName = createOutputFilePrefix(license, language, noBoilerplate);
    // }

    multipleOutputs.write(NullWritable.get(), warcWritable, fileName);
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.utils.URIExtractorTest.java
License:Apache License
@Test
public void testMapper() throws IOException, InterruptedException {
    final String expectedURI = "https://www.ukp.tu-darmstadt.de/ukp-home/";

    final WARCWritable warc = EasyMock.mock(WARCWritable.class);
    final WARCRecord record = EasyMock.mock(WARCRecord.class);
    final WARCRecord.Header header = EasyMock.mock(WARCRecord.Header.class);
    @SuppressWarnings("unchecked")
    final URIExtractor.URIExtractorMapper.Context context = EasyMock
            .mock(URIExtractor.URIExtractorMapper.Context.class);

    EasyMock.expect(record.getHeader()).andReturn(header);
    EasyMock.expect(warc.getRecord()).andReturn(record);
    EasyMock.expect(header.getTargetURI()).andReturn(expectedURI);
    context.write(new Text(expectedURI), NullWritable.get());

    EasyMock.replay(warc, record, header, context);

    final URIExtractor.URIExtractorMapper mapper = new URIExtractor.URIExtractorMapper();
    mapper.map(new LongWritable(0), warc, context);

    EasyMock.verify(warc, record, header, context);
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.utils.URIExtractorTest.java
License:Apache License
@Test
public void testReducer() throws IOException, InterruptedException {
    final String expectedURI = "https://www.ukp.tu-darmstadt.de/ukp-home/";

    @SuppressWarnings("unchecked")
    final URIExtractor.URIExtractorReducer.Context context = EasyMock
            .mock(URIExtractor.URIExtractorReducer.Context.class);
    context.write(new Text(expectedURI), NullWritable.get());

    @SuppressWarnings("unchecked")
    final Iterable<NullWritable> values = EasyMock.mock(Iterable.class);

    EasyMock.replay(context, values);

    final URIExtractor.URIExtractorReducer reducer = new URIExtractor.URIExtractorReducer();
    reducer.reduce(new Text(expectedURI), values, context);

    EasyMock.verify(context, values);
}
From source file:edu.uci.ics.fuzzyjoin.hadoop.datagen.MapRecordOnly.java
License:Apache License
public void map(Object unused, Text inputValue, OutputCollector<Text, NullWritable> output,
        Reporter reporter) throws IOException {
    String splits[] = inputValue.toString().split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX);
    int rid = Integer.valueOf(splits[FuzzyJoinConfig.RECORD_KEY]);
    if (noRecords == -1 || rid <= noRecords) {
        output.collect(inputValue, NullWritable.get());
    }
}
From source file:edu.uci.ics.fuzzyjoin.hadoop.datagen.MapTextToRecord.java
License:Apache License
public void map(Object key, Text value, OutputCollector<Text, NullWritable> output,
        Reporter reporter) throws IOException {
    if (noRecords == -1 || rid <= noRecords) {
        record.set("" + (rid++) + FuzzyJoinConfig.RECORD_SEPARATOR + value.toString());
        output.collect(record, NullWritable.get());
    }
}
From source file:edu.uci.ics.pregelix.api.io.internal.InternalVertexOutputFormat.java
License:Apache License
@Override
public VertexWriter<I, V, E> createVertexWriter(final TaskAttemptContext context)
        throws IOException, InterruptedException {
    return new VertexWriter<I, V, E>() {
        private RecordWriter recordWriter = sequenceOutputFormat.getRecordWriter(context);
        private NullWritable key = NullWritable.get();

        @Override
        public void initialize(TaskAttemptContext context) throws IOException, InterruptedException {
        }

        @SuppressWarnings("unchecked")
        @Override
        public void writeVertex(Vertex<I, V, E, ?> vertex) throws IOException, InterruptedException {
            recordWriter.write(key, vertex);
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            recordWriter.close(context);
        }
    };
}
From source file:edu.ucsb.cs.lsh.minhash.MinHashLshDriver.java
License:Apache License
public static void writeLsh(JobConf job, FileSystem fs, LshTable lshTable) {
    try {
        Path lshfile = new Path("lshfile");
        NullWritable none = NullWritable.get();
        if (fs.exists(lshfile))
            fs.delete(lshfile);
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, lshfile, LshTable.class,
                NullWritable.class, SequenceFile.CompressionType.NONE);
        writer.append(lshTable, none);
        writer.close();
        DistributedCache.addCacheFile(new URI("lshfile"), job);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:edu.umn.cs.spatialHadoop.operations.ConvexHull.java
License:Open Source License
/**
 * Computes the convex hull of an input file using a single machine algorithm.
 * The output is written to the output file. If the output file is null, the
 * output is just thrown away.
 *
 * @param inFile
 * @param outFile
 * @param params
 * @throws IOException
 * @throws InterruptedException
 */
public static void convexHullLocal(Path inFile, Path outFile, final OperationsParams params)
        throws IOException, InterruptedException {
    if (params.getBoolean("mem", false))
        MemoryReporter.startReporting();

    // 1- Split the input path/file to get splits that can be processed independently
    final SpatialInputFormat3<Rectangle, Point> inputFormat = new SpatialInputFormat3<Rectangle, Point>();
    Job job = Job.getInstance(params);
    SpatialInputFormat3.setInputPaths(job, inFile);
    final List<InputSplit> splits = inputFormat.getSplits(job);

    // 2- Read all input points in memory
    LOG.info("Reading points from " + splits.size() + " splits");
    List<Point[]> allLists = Parallel.forEach(splits.size(), new RunnableRange<Point[]>() {
        @Override
        public Point[] run(int i1, int i2) {
            try {
                List<Point> finalPoints = new ArrayList<Point>();
                final int MaxSize = 100000;
                Point[] points = new Point[MaxSize];
                int size = 0;
                for (int i = i1; i < i2; i++) {
                    org.apache.hadoop.mapreduce.lib.input.FileSplit fsplit =
                            (org.apache.hadoop.mapreduce.lib.input.FileSplit) splits.get(i);
                    final RecordReader<Rectangle, Iterable<Point>> reader = inputFormat
                            .createRecordReader(fsplit, null);
                    if (reader instanceof SpatialRecordReader3) {
                        ((SpatialRecordReader3) reader).initialize(fsplit, params);
                    } else if (reader instanceof RTreeRecordReader3) {
                        ((RTreeRecordReader3) reader).initialize(fsplit, params);
                    } else if (reader instanceof HDFRecordReader) {
                        ((HDFRecordReader) reader).initialize(fsplit, params);
                    } else {
                        throw new RuntimeException("Unknown record reader");
                    }
                    while (reader.nextKeyValue()) {
                        Iterable<Point> pts = reader.getCurrentValue();
                        for (Point p : pts) {
                            points[size++] = p.clone();
                            if (size >= points.length) {
                                // Perform convex hull and write the result to finalPoints
                                Point[] chPoints = convexHullInMemory(points);
                                for (Point skylinePoint : chPoints)
                                    finalPoints.add(skylinePoint);
                                size = 0; // reset
                            }
                        }
                    }
                    reader.close();
                }
                while (size-- > 0)
                    finalPoints.add(points[size]);
                return finalPoints.toArray(new Point[finalPoints.size()]);
            } catch (IOException e) {
                e.printStackTrace();
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            return null;
        }
    }, params.getInt("parallel", Runtime.getRuntime().availableProcessors()));

    int totalNumPoints = 0;
    for (Point[] list : allLists)
        totalNumPoints += list.length;

    LOG.info("Read " + totalNumPoints + " points and merging into one list");
    Point[] allPoints = new Point[totalNumPoints];
    int pointer = 0;
    for (Point[] list : allLists) {
        System.arraycopy(list, 0, allPoints, pointer, list.length);
        pointer += list.length;
    }
    allLists.clear(); // To let the GC collect it

    Point[] ch = convexHullInMemory(allPoints);

    if (outFile != null) {
        if (params.getBoolean("overwrite", false)) {
            FileSystem outFs = outFile.getFileSystem(new Configuration());
            outFs.delete(outFile, true);
        }
        GridRecordWriter<Point> out = new GridRecordWriter<Point>(outFile, null, null, null);
        for (Point pt : ch) {
            out.write(NullWritable.get(), pt);
        }
        out.close(null);
    }
}
From source file:edu.umn.cs.spatialHadoop.operations.Repartition.java
License:Open Source License
public static <S extends Shape> void repartitionLocal(Path inFile, Path outFile, CellInfo[] cells,
        OperationsParams params) throws IOException {
    String sindex = params.get("sindex");
    Shape shape = params.getShape("shape");
    JobConf job = new JobConf(params, Repartition.class);

    ShapeRecordWriter<Shape> writer;
    if (sindex.equals("grid") || sindex.equals("str") || sindex.equals("str+")) {
        writer = new GridRecordWriter<Shape>(outFile, job, null, cells);
    } else if (sindex.equals("rtree") || sindex.equals("r+tree")) {
        writer = new RTreeGridRecordWriter<Shape>(outFile, job, null, cells);
        writer.setStockObject(shape);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    // Read input file(s)
    FileInputFormat.addInputPath(job, inFile);
    ShapeInputFormat<S> inputFormat = new ShapeInputFormat<S>();
    InputSplit[] splits = inputFormat.getSplits(job, 1);
    for (InputSplit split : splits) {
        ShapeRecordReader<Shape> reader = new ShapeRecordReader<Shape>(params, (FileSplit) split);
        Rectangle c = reader.createKey();
        while (reader.next(c, shape)) {
            if (shape.getMBR() != null)
                writer.write(NullWritable.get(), shape);
        }
        reader.close();
    }
    writer.close(null);
}