Example usage for org.apache.hadoop.io BytesWritable getBytes

List of usage examples for org.apache.hadoop.io BytesWritable getBytes

Introduction

On this page you can find example usages of org.apache.hadoop.io BytesWritable.getBytes().

Prototype

@Override
public byte[] getBytes() 

Document

Get the data backing the BytesWritable.
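
Note: getBytes() returns the backing buffer, which may be longer than the valid data. getLength() gives the number of valid bytes, and copyBytes() returns an array trimmed to exactly that length; the upstream javadoc recommends copyBytes() when the returned array must be precisely the length of the data. A minimal sketch of the difference:

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.hadoop.io.BytesWritable;

public class GetBytesVsCopyBytes {
    public static void main(String[] args) {
        BytesWritable w = new BytesWritable();
        w.set("hello".getBytes(StandardCharsets.UTF_8), 0, 5);
        // Shrinking the writable does not shrink the backing buffer.
        w.setSize(2);

        byte[] backing = w.getBytes();  // backing buffer, possibly longer than the data
        byte[] exact = w.copyBytes();   // exactly getLength() bytes

        System.out.println(backing.length >= w.getLength());               // true
        System.out.println(Arrays.equals(exact, new byte[] { 'h', 'e' })); // true
    }
}

Several of the examples below pass getBytes() around without the matching getLength(), which only works when the backing buffer happens to be exactly the size of the data.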

Usage

From source file:org.opensextant.mapreduce.GeoTaggerMapper.java

License:Apache License

@Override
public void map(BytesWritable key, Text textRecord, Context context) throws IOException, InterruptedException {
    String text = null;
    HashSet<String> dedup = new HashSet<>();

    try {
        JSONObject obj = JSONObject.fromObject(textRecord.toString());
        if (!obj.containsKey("text")) {
            return;
        }
        String text_id = key.toString();
        text = obj.getString("text");
        TextInput textObj = new TextInput(text_id, text);
        textObj.langid = "en";
        /* LANG ID = 'ENGLISH',
         * If this is not true, then you need to add LangID to your metadata or detect it live
         */

        List<TextMatch> matches = geocoder.extract(textObj);

        if (matches.isEmpty()) {
            return;
        }

        /* NORMALIZE findings.
         * Reduce all matches, minimizing duplicates, removing whitespace, etc.
         *
         */
        int filtered = 0, duplicates = 0;
        for (TextMatch tm : matches) {

            //                if (filterCrap(tm.getText())) {
            //                    filtered += 1;
            //                    continue;
            //                }
            if (dedup.contains(tm.getText())) {
                duplicates += 1;
                continue;
            }
            dedup.add(tm.getText());
            JSONObject o = match2JSON(tm);
            Text matchOutput = new Text(o.toString());
            context.write(NullWritable.get(), matchOutput);
        }
        if (log.isTraceEnabled()) {
            log.trace("For key " + new String(key.getBytes(), StandardCharsets.UTF_8) + " found "
                    + matches.size() + ", filtered: " + filtered + " as junk, " + duplicates + " duplicates.");
        }
    } catch (Exception err) {
        log.error("Error running geotagger", err);
    }
}
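
One caveat in the trace line above: new String(key.getBytes(), StandardCharsets.UTF_8) decodes the whole backing buffer, so it can append padding bytes past key.getLength(). A bounded variant, sketched as a small helper (the KeyText name is hypothetical):

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.io.BytesWritable;

final class KeyText {
    // Decode only the valid region of the key: offset 0, length key.getLength().
    static String of(BytesWritable key) {
        return new String(key.getBytes(), 0, key.getLength(), StandardCharsets.UTF_8);
    }
}

With that helper, the trace call would read log.trace("For key " + KeyText.of(key) + " found " + ...).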

From source file:org.opensextant.mapreduce.KeywordTaggerMapper.java

License:Apache License

@Override
public void map(BytesWritable key, Text textRecord, Context context) throws IOException, InterruptedException {
    ++counter;
    String text = null;
    HashSet<String> dedup = new HashSet<>();

    try {
        JSONObject obj = JSONObject.fromObject(textRecord.toString());
        if (!obj.containsKey("text")) {
            return;
        }
        String text_id = key.toString();
        text = obj.getString("text");
        TextInput textObj = new TextInput(text_id, text);
        textObj.langid = "en";
        /* LANG ID = 'ENGLISH', 
         * If this is not true, then you need to add LangID to your metadata or detect it live 
         */

        /*
         * Testing to see if XTax tagger operates in Hadoop job
         */
        List<TextMatch> matches = xtax.extract(textObj);

        if (matches.isEmpty()) {
            return;
        }

        /* NORMALIZE findings.
         * Reduce all matches, minimizing duplicates, removing whitespace, etc.
         * 
         */
        int filtered = 0, duplicates = 0;
        for (TextMatch tm : matches) {
            if (filterCrap(tm.getText())) {
                filtered += 1;
                continue;
            }
            if (dedup.contains(tm.getText())) {
                duplicates += 1;
                continue;
            }
            dedup.add(tm.getText());
            JSONObject o = match2JSON(tm);
            Text matchOutput = new Text(o.toString());
            context.write(NullWritable.get(), matchOutput);
        }
        if (log.isTraceEnabled()) {
            log.trace("For key " + new String(key.getBytes(), StandardCharsets.UTF_8) + " found "
                    + matches.size() + ", filtered: " + filtered + " as junk, " + duplicates + " duplicates.");
        }
    } catch (Exception err) {
        log.error("Error running xtax", err);
        // System.exit(-1);
    }
}
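
The same caveat applies to the trace line here: the key is decoded from the full backing buffer, and the length-bounded helper sketched after the GeoTaggerMapper example above would apply unchanged.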

From source file:org.pentaho.hadoop.mapreduce.converter.converters.BytesWritableToByteArrayConverter.java

License:Apache License

@Override
public byte[] convert(ValueMetaInterface meta, BytesWritable obj) throws TypeConversionException {
    return obj.getBytes().clone();
}
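
obj.getBytes().clone() duplicates the entire backing buffer, including any slack space past obj.getLength(). A length-trimmed variant of the same conversion, as a sketch (the class name is illustrative; the real converter also takes a ValueMetaInterface argument):

import org.apache.hadoop.io.BytesWritable;

final class TrimmedBytesConversion {
    // copyBytes() is equivalent to Arrays.copyOf(obj.getBytes(), obj.getLength()).
    static byte[] convert(BytesWritable obj) {
        return obj.copyBytes();
    }
}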

From source file:org.sleuthkit.hadoop.scoring.CrossImageScoreReducer.java

License:Open Source License

@Override
public void reduce(BytesWritable fileHash, Iterable<BytesWritable> imgIDs, Context context)
        throws IOException, InterruptedException {
    boolean inThisImage = false;
    BytesArrayWritable aw = new BytesArrayWritable();
    HashSet<Writable> valueList = new HashSet<Writable>();
    for (BytesWritable curImgID : imgIDs) {
        if (belongsToImage(curImgID.getBytes())) {
            System.out.println("Hashes equal: " + new String(Hex.encodeHex(curImgID.getBytes())) + " to "
                    + Hex.encodeHexString(ourImageID));
            inThisImage = true;
        }
        valueList.add(new BytesWritable(curImgID.getBytes().clone()));
    }

    if (inThisImage) {
        aw.set(valueList.toArray(new BytesWritable[0]));
        context.write(fileHash, aw);
        System.out.println("Done Writing Context.");
    }
}
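
Copying each value is needed because Hadoop reuses the value object across the iteration, but getBytes().clone() again copies slack space along with the data. A one-line variant of the valueList.add(...) call above, as a sketch:

// copyBytes() copies only the valid region of the reused value object.
valueList.add(new BytesWritable(curImgID.copyBytes()));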

From source file:org.tensorflow.hadoop.io.TFRecordFileOutputFormat.java

License:Open Source License

@Override
public RecordWriter<BytesWritable, NullWritable> getRecordWriter(TaskAttemptContext context)
        throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    Path file = getDefaultWorkFile(context, "");
    FileSystem fs = file.getFileSystem(conf);

    int bufferSize = TFRecordIOConf.getBufferSize(conf);
    final FSDataOutputStream fsdos = fs.create(file, true, bufferSize);
    final TFRecordWriter writer = new TFRecordWriter(fsdos);
    return new RecordWriter<BytesWritable, NullWritable>() {
        @Override
        public void write(BytesWritable key, NullWritable value) throws IOException, InterruptedException {
            writer.write(key.getBytes());
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            fsdos.close();
        }
    };
}
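
writer.write(key.getBytes()) serializes the whole backing buffer, so a key whose buffer is larger than its length would be followed by slack bytes in the TFRecord. A trimmed variant of the write(...) body, as a sketch that stays with the write(byte[]) call used above rather than assuming other TFRecordWriter overloads:

// copyBytes() returns exactly key.getLength() bytes, so no slack is written.
writer.write(key.copyBytes());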

From source file:org.tensorflow.hadoop.io.TFRecordFileTest.java

License:Open Source License

@Test
public void testInputOutputFormat() throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);

    Path outdir = new Path(System.getProperty("test.build.data", "/tmp"), "tfr-test");

    TFRecordFileOutputFormat.setOutputPath(job, outdir);

    TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
    OutputFormat<BytesWritable, NullWritable> outputFormat = new TFRecordFileOutputFormat();
    OutputCommitter committer = outputFormat.getOutputCommitter(context);
    committer.setupJob(job);
    RecordWriter<BytesWritable, NullWritable> writer = outputFormat.getRecordWriter(context);

    // Write Example with random numbers
    Random rand = new Random();
    Map<Long, Long> records = new TreeMap<Long, Long>();
    try {
        for (int i = 0; i < RECORDS; ++i) {
            long randValue = rand.nextLong();
            records.put((long) i, randValue);
            Int64List data = Int64List.newBuilder().addValue(i).addValue(randValue).build();
            Feature feature = Feature.newBuilder().setInt64List(data).build();
            Features features = Features.newBuilder().putFeature("data", feature).build();
            Example example = Example.newBuilder().setFeatures(features).build();
            BytesWritable key = new BytesWritable(example.toByteArray());
            writer.write(key, NullWritable.get());
        }
    } finally {
        writer.close(context);
    }
    committer.commitTask(context);
    committer.commitJob(job);

    // Read and compare
    TFRecordFileInputFormat.setInputPaths(job, outdir);
    InputFormat<BytesWritable, NullWritable> inputFormat = new TFRecordFileInputFormat();
    for (InputSplit split : inputFormat.getSplits(job)) {
        RecordReader<BytesWritable, NullWritable> reader = inputFormat.createRecordReader(split, context);
        MapContext<BytesWritable, NullWritable, BytesWritable, NullWritable> mcontext = new MapContextImpl<BytesWritable, NullWritable, BytesWritable, NullWritable>(
                job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
                MapReduceTestUtil.createDummyReporter(), split);
        reader.initialize(split, mcontext);
        try {
            while (reader.nextKeyValue()) {
                BytesWritable bytes = reader.getCurrentKey();
                Example example = Example.parseFrom(bytes.getBytes());
                Int64List data = example.getFeatures().getFeatureMap().get("data").getInt64List();
                Long key = data.getValue(0);
                Long value = data.getValue(1);
                assertEquals(records.get(key), value);
                records.remove(key);
            }
        } finally {
            reader.close();
        }
    }
    assertEquals(0, records.size());
}
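
In this round trip the reader hands back writables sized exactly to each record, so the parse above happens to work; in general, Example.parseFrom(bytes.getBytes()) can see trailing slack and fail. A bounded variant of that line, as a sketch:

// Parse only the valid region of the writable.
Example example = Example.parseFrom(bytes.copyBytes());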

From source file:org.tensorflow.hadoop.io.WholeFileOutputFormat.java

License:Open Source License

@Override
public RecordWriter<BytesWritable, NullWritable> getRecordWriter(TaskAttemptContext context)
        throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    Path file = getDefaultWorkFile(context, "");
    FileSystem fs = file.getFileSystem(conf);

    int bufferSize = conf.getInt("io.file.buffer.size", 4096);
    final FSDataOutputStream fsdos = fs.create(file, true, bufferSize);
    return new RecordWriter<BytesWritable, NullWritable>() {
        @Override
        public void write(BytesWritable key, NullWritable value) throws IOException, InterruptedException {
            fsdos.write(key.getBytes());
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            fsdos.close();
        }
    };
}
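
As with the TFRecord writer above, fsdos.write(key.getBytes()) can emit slack bytes past the valid data. FSDataOutputStream inherits the standard offset/length overload, so the write can be bounded directly; a sketch of the changed line:

// Write only the first key.getLength() bytes of the backing buffer.
fsdos.write(key.getBytes(), 0, key.getLength());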

From source file:parquet.hadoop.thrift.ThriftBytesWriteSupport.java

License:Apache License

private TProtocol protocol(BytesWritable record) {
    TProtocol protocol = protocolFactory
            .getProtocol(new TIOStreamTransport(new ByteArrayInputStream(record.getBytes())));

    /* Reduce the chance of OOM when data is corrupted. When readBinary is called on TBinaryProtocol, it reads the length of the binary first,
     so if the data is corrupted, it could read a big integer as the length of the binary and therefore causes OOM to happen.
     Currently this fix only applies to TBinaryProtocol which has the setReadLength defined.
     */
    if (protocol instanceof TBinaryProtocol) {
        ((TBinaryProtocol) protocol).setReadLength(record.getLength());
    }
    return protocol;
}
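
This example already guards the read side with setReadLength(record.getLength()), but the transport is still backed by the full buffer, so the protocol could read into slack bytes. Bounding the stream to the valid region is a complementary option; a sketch of the constructor call:

// Expose only the first record.getLength() bytes to the Thrift transport.
TProtocol protocol = protocolFactory.getProtocol(
        new TIOStreamTransport(new ByteArrayInputStream(record.getBytes(), 0, record.getLength())));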

From source file:protobuf.examples.ProtobufMapper.java

License:Open Source License

public void map(LongWritable key, BytesWritable value, OutputCollector<Text, IntWritable> output,
        Reporter reporter) throws IOException {

    LOG.info("In Mapper Get Data: " + value.toString());

    int bufferSize = value.getLength();
    byte buffer[] = new byte[bufferSize];
    System.arraycopy(value.getBytes(), 0, buffer, 0, bufferSize);

    output.collect(new Text("msg.getEmail()"), new IntWritable(1));
}
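
The arraycopy above is the correct pattern: it copies exactly getLength() bytes out of the possibly larger backing buffer. The same three lines can be written with copyBytes(); a sketch:

// Equivalent to allocating getLength() bytes and arraycopy-ing the valid region into it.
byte[] buffer = value.copyBytes();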

From source file:shark.io.MutableBytesWritable.java

License:Apache License

/**
 * Set the BytesWritable to the contents of the given newData.
 * @param newData the value to set this BytesWritable to.
 */
public void set(BytesWritable newData) {
    set(newData.getBytes(), 0, newData.getLength());
}