List of usage examples for org.apache.hadoop.io.BytesWritable.toString()
@Override
public String toString()
From source file:com.cloudera.recordservice.examples.terasort.TeraValidate.java
License:Apache License
/**
 * Renders the valid bytes of a {@link Text} using {@link BytesWritable#toString()}.
 *
 * @param t text whose first {@code getLength()} bytes are rendered
 * @return the string form of a BytesWritable holding those bytes
 */
private static String textifyBytes(Text t) {
    final BytesWritable wrapper = new BytesWritable();
    // Only the first getLength() bytes of the Text are handed over.
    final int validLength = t.getLength();
    wrapper.set(t.getBytes(), 0, validLength);
    return wrapper.toString();
}
From source file:org.opensextant.mapreduce.GeoTaggerMapper.java
License:Apache License
/** * /*from w w w .j a va2 s . c om*/ */ @Override public void map(BytesWritable key, Text textRecord, Context context) throws IOException, InterruptedException { String text = null; HashSet<String> dedup = new HashSet<>(); try { JSONObject obj = JSONObject.fromObject(textRecord.toString()); if (!obj.containsKey("text")) { return; } String text_id = key.toString(); text = obj.getString("text"); TextInput textObj = new TextInput(text_id, text); textObj.langid = "en"; /* LANG ID = 'ENGLISH', * If this is not true, then you need to add LangID to your metadata or detect it live */ List<TextMatch> matches = geocoder.extract(textObj); if (matches.isEmpty()) { return; } /* NORMALIZE findings. * Reduce all matches, minimizing duplicates, removing whitespace, etc. * */ int filtered = 0, duplicates = 0; for (TextMatch tm : matches) { // if (filterCrap(tm.getText())) { // filtered += 1; // continue; // } if (dedup.contains(tm.getText())) { duplicates += 1; continue; } dedup.add(tm.getText()); JSONObject o = match2JSON(tm); Text matchOutput = new Text(o.toString()); context.write(NullWritable.get(), matchOutput); } if (log.isTraceEnabled()) { log.trace("For key " + new String(key.getBytes(), StandardCharsets.UTF_8) + " found " + matches.size() + ", filtered: " + filtered + " as junk, " + duplicates + " duplicates."); } } catch (Exception err) { log.error("Error running geotagger", err); } }
From source file:org.opensextant.mapreduce.KeywordTaggerMapper.java
License:Apache License
/** * /*from ww w .j ava 2 s. co m*/ */ @Override public void map(BytesWritable key, Text textRecord, Context context) throws IOException, InterruptedException { ++counter; String text = null; HashSet<String> dedup = new HashSet<>(); try { JSONObject obj = JSONObject.fromObject(textRecord.toString()); if (!obj.containsKey("text")) { return; } String text_id = key.toString(); text = obj.getString("text"); TextInput textObj = new TextInput(text_id, text); textObj.langid = "en"; /* LANG ID = 'ENGLISH', * If this is not true, then you need to add LangID to your metadata or detect it live */ /* * Testing to see if XTax tagger operates in Hadoop job */ List<TextMatch> matches = xtax.extract(textObj); if (matches.isEmpty()) { return; } /* NORMALIZE findings. * Reduce all matches, minimizing duplicates, removing whitespace, etc. * */ int filtered = 0, duplicates = 0; for (TextMatch tm : matches) { if (filterCrap(tm.getText())) { filtered += 1; continue; } if (dedup.contains(tm.getText())) { duplicates += 1; continue; } dedup.add(tm.getText()); JSONObject o = match2JSON(tm); Text matchOutput = new Text(o.toString()); context.write(NullWritable.get(), matchOutput); } if (log.isTraceEnabled()) { log.trace("For key " + new String(key.getBytes(), StandardCharsets.UTF_8) + " found " + matches.size() + ", filtered: " + filtered + " as junk, " + duplicates + " duplicates."); } } catch (Exception err) { log.error("Error running xtax", err); // System.exit(-1); } }
From source file:protobuf.examples.ProtobufMapper.java
License:Open Source License
/**
 * Logs the incoming value, copies its valid bytes into a fresh array, and emits a
 * count of 1 under the fixed key {@code "msg.getEmail()"}.
 *
 * @param key input key; not read by this method
 * @param value raw record bytes
 * @param output collector receiving the single (Text, IntWritable) pair
 * @param reporter progress reporter; not read by this method
 * @throws IOException declared by the signature (presumably for {@code output.collect})
 */
public void map(LongWritable key, BytesWritable value,
        OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
    LOG.info("In Mapper Get Data: " + value.toString());

    // Copy just the first getLength() bytes out of the writable's backing array.
    final int validLength = value.getLength();
    final byte[] payload = new byte[validLength];
    System.arraycopy(value.getBytes(), 0, payload, 0, validLength);
    // NOTE(review): payload is never read after this copy, and the output key below
    // is a literal string rather than a decoded field - looks like a stubbed-out
    // message parse; confirm intent.

    final Text outKey = new Text("msg.getEmail()");
    final IntWritable one = new IntWritable(1);
    output.collect(outKey, one);
}
From source file:weka.distributed.hadoop.CorrelationMatrixRowHadoopReducer.java
License:Open Source License
@Override public void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException { List<MatrixRowHolder> rowsToAgg = new ArrayList<MatrixRowHolder>(); try {/*from www . java 2 s . c o m*/ for (BytesWritable b : values) { byte[] bytes = b.getBytes(); rowsToAgg.add(deserialize(bytes)); } } catch (ClassNotFoundException ex) { throw new IOException(ex); } if (rowsToAgg.size() > 0) { int rowNum = rowsToAgg.get(0).getRowNumber(); List<double[]> rows = new ArrayList<double[]>(); List<int[]> coOcc = null; if (!m_missingsWereReplacedWithMeans) { coOcc = new ArrayList<int[]>(); } for (MatrixRowHolder r : rowsToAgg) { if (r.getRowNumber() != rowNum) { throw new IOException("Matrix row numbers for this key appear to differ!"); } rows.add(r.getRow()); if (!m_missingsWereReplacedWithMeans) { coOcc.add(r.getCoOccurrencesCounts()); } } try { double[] aggregated = m_task.aggregate(rowsToAgg.get(0).getRowNumber(), rows, coOcc, m_headerWithSummaryAtts, m_missingsWereReplacedWithMeans, m_covariance, m_deleteClassIfSet); // assemble Text key (row num) and Text row (space separated // values) Text outKey = new Text(); outKey.set("" + rowNum); StringBuilder b = new StringBuilder(); for (int i = 0; i < aggregated.length; i++) { if (i < aggregated.length - 1) { b.append("" + aggregated[i]).append(" "); } else { b.append("" + aggregated[i]); } } Text outVal = new Text(); outVal.set(b.toString()); context.write(outKey, outVal); } catch (DistributedWekaException e) { throw new IOException(e); } catch (InterruptedException e) { throw new IOException(e); } } }