Example usage for org.apache.hadoop.io NullWritable get

Introduction

On this page you can find example usage for org.apache.hadoop.io.NullWritable.get().

Prototype

public static NullWritable get() 

Document

Returns the single instance of this class.
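
A minimal sketch of the common pattern, before the project-specific examples below (the class name PassThroughMapper is hypothetical and not taken from any of the sources on this page): NullWritable.get() returns the shared singleton, which is used wherever the MapReduce API requires a key or value that carries no data.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical mapper: emits each input line as the key and fills the value
// slot with the NullWritable singleton, since no value payload is needed.
public class PassThroughMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        context.write(value, NullWritable.get());
    }
}

Because NullWritable serializes to zero bytes, this keeps the output compact while still satisfying the key/value contract of the framework.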

Usage

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.WARCWriterReducerClass.java

License:Apache License

@Override
protected void reduce(Text key, Iterable<WARCWritable> values, Context context)
        throws IOException, InterruptedException {
    for (WARCWritable warcWritable : values) {
        context.write(NullWritable.get(), warcWritable);
    }
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.WARCWriterReducerClass.java

License:Apache License

/**
 * Writes single WARCWritable to the output with specific output file prefix
 *
 * @param warcWritable    warc record
 * @param multipleOutputs output
 * @throws IOException          exception
 * @throws InterruptedException exception
 */
// TODO move somewhere else?
public static void writeSingleWARCWritableToOutput(WARCWritable warcWritable,
        MultipleOutputs<NullWritable, WARCWritable> multipleOutputs) throws IOException, InterruptedException {
    WARCRecord.Header header = warcWritable.getRecord().getHeader();
    String license = header.getField(WARCRecord.WARCRecordFieldConstants.LICENSE);
    String language = header.getField(WARCRecord.WARCRecordFieldConstants.LANGUAGE);
    String noBoilerplate = header.getField(WARCRecord.WARCRecordFieldConstants.NO_BOILERPLATE);
    String minimalHtml = header.getField(WARCRecord.WARCRecordFieldConstants.MINIMAL_HTML);

    // set the file name prefix
    String fileName = createOutputFilePrefix(license, language, noBoilerplate, minimalHtml);

    // bottleneck of single reducer for all "Lic_none_Lang_en" pages (majority of Web)
    //        if ("en".equals(language) && LicenseDetector.NO_LICENCE.equals(license)) {
    //            long simHash = Long
    //                    .valueOf(header.getField(WARCRecord.WARCRecordFieldConstants.SIMHASH));
    //            int binNumber = getBinNumberFromSimHash(simHash);
    //            fileName = createOutputFilePrefix(license, language, noBoilerplate);
    //        }

    multipleOutputs.write(NullWritable.get(), warcWritable, fileName);
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.utils.URIExtractorTest.java

License:Apache License

@Test
public void testMapper() throws IOException, InterruptedException {
    final String expectedURI = "https://www.ukp.tu-darmstadt.de/ukp-home/";

    final WARCWritable warc = EasyMock.mock(WARCWritable.class);
    final WARCRecord record = EasyMock.mock(WARCRecord.class);
    final WARCRecord.Header header = EasyMock.mock(WARCRecord.Header.class);

    @SuppressWarnings("unchecked")
    final URIExtractor.URIExtractorMapper.Context context = EasyMock
            .mock(URIExtractor.URIExtractorMapper.Context.class);

    EasyMock.expect(record.getHeader()).andReturn(header);
    EasyMock.expect(warc.getRecord()).andReturn(record);
    EasyMock.expect(header.getTargetURI()).andReturn(expectedURI);
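    // This write is recorded as an expectation on the mocked context;
    // the real call is made inside mapper.map() after replay().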
    context.write(new Text(expectedURI), NullWritable.get());
    EasyMock.replay(warc, record, header, context);

    final URIExtractor.URIExtractorMapper mapper = new URIExtractor.URIExtractorMapper();
    mapper.map(new LongWritable(0), warc, context);

    EasyMock.verify(warc, record, header, context);
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.utils.URIExtractorTest.java

License:Apache License

@Test
public void testReducer() throws IOException, InterruptedException {
    final String expectedURI = "https://www.ukp.tu-darmstadt.de/ukp-home/";

    @SuppressWarnings("unchecked")
    final URIExtractor.URIExtractorReducer.Context context = EasyMock
            .mock(URIExtractor.URIExtractorReducer.Context.class);
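    // Recorded as an expectation on the mocked context; the actual write
    // is made by the reducer after replay().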
    context.write(new Text(expectedURI), NullWritable.get());

    @SuppressWarnings("unchecked")
    final Iterable<NullWritable> values = EasyMock.mock(Iterable.class);

    EasyMock.replay(context, values);

    final URIExtractor.URIExtractorReducer reducer = new URIExtractor.URIExtractorReducer();

    reducer.reduce(new Text(expectedURI), values, context);

    EasyMock.verify(context, values);
}

From source file:edu.uci.ics.fuzzyjoin.hadoop.datagen.MapRecordOnly.java

License:Apache License

public void map(Object unused, Text inputValue, OutputCollector<Text, NullWritable> output, Reporter reporter)
        throws IOException {
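    // noRecords is an instance field of this mapper class (not shown in this excerpt)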
    String splits[] = inputValue.toString().split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX);
    int rid = Integer.valueOf(splits[FuzzyJoinConfig.RECORD_KEY]);
    if (noRecords == -1 || rid <= noRecords) {
        output.collect(inputValue, NullWritable.get());
    }
}

From source file:edu.uci.ics.fuzzyjoin.hadoop.datagen.MapTextToRecord.java

License:Apache License

public void map(Object key, Text value, OutputCollector<Text, NullWritable> output, Reporter reporter)
        throws IOException {
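    // rid, record, and noRecords are instance fields of this mapper class (not shown in this excerpt)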
    if (noRecords == -1 || rid <= noRecords) {
        record.set("" + (rid++) + FuzzyJoinConfig.RECORD_SEPARATOR + value.toString());
        output.collect(record, NullWritable.get());
    }
}

From source file:edu.uci.ics.pregelix.api.io.internal.InternalVertexOutputFormat.java

License:Apache License

@Override
public VertexWriter<I, V, E> createVertexWriter(final TaskAttemptContext context)
        throws IOException, InterruptedException {
    return new VertexWriter<I, V, E>() {
        private RecordWriter recordWriter = sequenceOutputFormat.getRecordWriter(context);
        private NullWritable key = NullWritable.get();

        @Override
        public void initialize(TaskAttemptContext context) throws IOException, InterruptedException {

        }

        @SuppressWarnings("unchecked")
        @Override
        public void writeVertex(Vertex<I, V, E, ?> vertex) throws IOException, InterruptedException {
            recordWriter.write(key, vertex);
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            recordWriter.close(context);
        }

    };
}

From source file:edu.ucsb.cs.lsh.minhash.MinHashLshDriver.java

License:Apache License

public static void writeLsh(JobConf job, FileSystem fs, LshTable lshTable) {
    try {
        Path lshfile = new Path("lshfile");
        NullWritable none = NullWritable.get();
        if (fs.exists(lshfile))
            fs.delete(lshfile);
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, lshfile, LshTable.class,
                NullWritable.class, SequenceFile.CompressionType.NONE);
        writer.append(lshTable, none);
        writer.close();
        DistributedCache.addCacheFile(new URI("lshfile"), job);
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:edu.umn.cs.spatialHadoop.operations.ConvexHull.java

License:Open Source License

/**
 * Computes the convex hull of an input file using a single machine algorithm.
 * The output is written to the output file. If output file is null, the
 * output is just thrown away.
 * @param inFile
 * @param outFile
 * @param params
 * @throws IOException
 * @throws InterruptedException
 */
public static void convexHullLocal(Path inFile, Path outFile, final OperationsParams params)
        throws IOException, InterruptedException {
    if (params.getBoolean("mem", false))
        MemoryReporter.startReporting();
    // 1- Split the input path/file to get splits that can be processed
    // independently
    final SpatialInputFormat3<Rectangle, Point> inputFormat = new SpatialInputFormat3<Rectangle, Point>();
    Job job = Job.getInstance(params);
    SpatialInputFormat3.setInputPaths(job, inFile);
    final List<InputSplit> splits = inputFormat.getSplits(job);

    // 2- Read all input points in memory
    LOG.info("Reading points from " + splits.size() + " splits");
    List<Point[]> allLists = Parallel.forEach(splits.size(), new RunnableRange<Point[]>() {
        @Override
        public Point[] run(int i1, int i2) {
            try {
                List<Point> finalPoints = new ArrayList<Point>();
                final int MaxSize = 100000;
                Point[] points = new Point[MaxSize];
                int size = 0;
                for (int i = i1; i < i2; i++) {
                    org.apache.hadoop.mapreduce.lib.input.FileSplit fsplit = (org.apache.hadoop.mapreduce.lib.input.FileSplit) splits
                            .get(i);
                    final RecordReader<Rectangle, Iterable<Point>> reader = inputFormat
                            .createRecordReader(fsplit, null);
                    if (reader instanceof SpatialRecordReader3) {
                        ((SpatialRecordReader3) reader).initialize(fsplit, params);
                    } else if (reader instanceof RTreeRecordReader3) {
                        ((RTreeRecordReader3) reader).initialize(fsplit, params);
                    } else if (reader instanceof HDFRecordReader) {
                        ((HDFRecordReader) reader).initialize(fsplit, params);
                    } else {
                        throw new RuntimeException("Unknown record reader");
                    }
                    while (reader.nextKeyValue()) {
                        Iterable<Point> pts = reader.getCurrentValue();
                        for (Point p : pts) {
                            points[size++] = p.clone();
                            if (size >= points.length) {
                                // Perform convex hull and write the result to finalPoints
                                Point[] chPoints = convexHullInMemory(points);
                                for (Point skylinePoint : chPoints)
                                    finalPoints.add(skylinePoint);
                                size = 0; // reset
                            }
                        }
                    }
                    reader.close();
                }
                while (size-- > 0)
                    finalPoints.add(points[size]);
                return finalPoints.toArray(new Point[finalPoints.size()]);
            } catch (IOException e) {
                e.printStackTrace();
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            return null;
        }
    }, params.getInt("parallel", Runtime.getRuntime().availableProcessors()));

    int totalNumPoints = 0;
    for (Point[] list : allLists)
        totalNumPoints += list.length;

    LOG.info("Read " + totalNumPoints + " points and merging into one list");
    Point[] allPoints = new Point[totalNumPoints];
    int pointer = 0;

    for (Point[] list : allLists) {
        System.arraycopy(list, 0, allPoints, pointer, list.length);
        pointer += list.length;
    }
    allLists.clear(); // To let the GC collect it

    Point[] ch = convexHullInMemory(allPoints);

    if (outFile != null) {
        if (params.getBoolean("overwrite", false)) {
            FileSystem outFs = outFile.getFileSystem(new Configuration());
            outFs.delete(outFile, true);
        }
        GridRecordWriter<Point> out = new GridRecordWriter<Point>(outFile, null, null, null);
        for (Point pt : ch) {
            out.write(NullWritable.get(), pt);
        }
        out.close(null);
    }
}

From source file:edu.umn.cs.spatialHadoop.operations.Repartition.java

License:Open Source License

public static <S extends Shape> void repartitionLocal(Path inFile, Path outFile, CellInfo[] cells,
        OperationsParams params) throws IOException {
    String sindex = params.get("sindex");
    Shape shape = params.getShape("shape");
    JobConf job = new JobConf(params, Repartition.class);

    ShapeRecordWriter<Shape> writer;
    if (sindex.equals("grid") || sindex.equals("str") || sindex.equals("str+")) {
        writer = new GridRecordWriter<Shape>(outFile, job, null, cells);
    } else if (sindex.equals("rtree") || sindex.equals("r+tree")) {
        writer = new RTreeGridRecordWriter<Shape>(outFile, job, null, cells);
        writer.setStockObject(shape);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    // Read input file(s)
    FileInputFormat.addInputPath(job, inFile);
    ShapeInputFormat<S> inputFormat = new ShapeInputFormat<S>();
    InputSplit[] splits = inputFormat.getSplits(job, 1);
    for (InputSplit split : splits) {
        ShapeRecordReader<Shape> reader = new ShapeRecordReader<Shape>(params, (FileSplit) split);
        Rectangle c = reader.createKey();

        while (reader.next(c, shape)) {
            if (shape.getMBR() != null)
                writer.write(NullWritable.get(), shape);
        }
        reader.close();
    }
    writer.close(null);
}