List of usage examples for org.apache.hadoop.mapred.Reporter.incrCounter
public abstract void incrCounter(String group, String counter, long amount);
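incrCounter adds amount to the counter identified by the (group, counter) pair, creating the counter on first use; Hadoop aggregates the per-task values into job-level totals. Before the real-world examples below, here is a minimal sketch of the common pattern in an old-API mapper; the class name CountingMapper and the counter names "MyApp" and "records-processed" are hypothetical, chosen only for illustration:

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class CountingMapper extends MapReduceBase
        implements Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    public void map(LongWritable key, Text value, OutputCollector<Text, LongWritable> output,
            Reporter reporter) throws IOException {
        // Increment a custom counter; Hadoop creates it on first use and
        // sums the per-task contributions into the job-level total.
        reporter.incrCounter("MyApp", "records-processed", 1L);
        output.collect(value, new LongWritable(1L));
    }
}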
From source file:RandIntPartSamplerMapper.java
License:Apache License
@Override
public void map(NullWritable lineNum, TextArrayWritable transactionsArrWr,
        OutputCollector<IntWritable, Text> output, Reporter reporter) throws IOException {
    // Record this mapper's start time as a counter value, keyed by mapper id.
    reporter.incrCounter("FIMMapperStart", String.valueOf(id), System.currentTimeMillis());
    Random rand = new Random();
    Writable[] transactions = transactionsArrWr.get();
    int transactionsNum = transactions.length;
    System.out.println("transactionsNum: " + transactionsNum);
    for (int i = 0; i < toSample; i++) {
        int sampledIndex = rand.nextInt(transactionsNum);
        output.collect(sampleDestinations[i], (Text) transactions[sampledIndex]);
    }
    reporter.incrCounter("FIMMapperEnd", String.valueOf(id), System.currentTimeMillis());
}
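This example uses counters as a side channel for per-task wall-clock timestamps: each (group, id) pair is written by exactly one mapper, so the aggregated "sum" is simply the recorded time. A hypothetical driver-side sketch of reading those values back after the job finishes (old mapred API; the FIMDriver class name and the elided job setup are assumptions):

import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;

public class FIMDriver {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(FIMDriver.class);
        // ... input/output paths, mapper/reducer classes elided ...
        RunningJob job = JobClient.runJob(conf); // blocks until completion
        Counters counters = job.getCounters();
        // Each counter in the group is named after a mapper id; its value is
        // the start timestamp that mapper recorded via incrCounter().
        for (Counters.Counter c : counters.getGroup("FIMMapperStart")) {
            System.out.println("mapper " + c.getName() + " started at " + c.getCounter());
        }
    }
}

Storing timestamps in counters avoids writing a side file to HDFS, at the cost of bending counter semantics (the values are timestamps, not counts).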
From source file:FIMReducer.java
License:Apache License
@Override
public void reduce(IntWritable key, Iterator<Text> values,
        OutputCollector<Text, DoubleWritable> output, Reporter reporter) throws IOException {
    long startTime = System.currentTimeMillis();
    if (!set) {
        reporter.incrCounter("FIMReducerStart", String.valueOf(id), startTime);
        reporter.incrCounter("FIMReducerEnd", String.valueOf(id), startTime);
        set = true;
    }
    // This is a very crappy way of checking whether we got the
    // right number of transactions. It may not be too inefficient though.
    ArrayList<Text> transactions = new ArrayList<Text>(sampleSize);
    while (values.hasNext()) {
        Text trans = new Text(values.next().toString());
        transactions.add(trans);
    }
    if (sampleSize != transactions.size()) {
        System.out.println("WRONG NUMBER OF TRANSACTIONS!");
    }
    System.out.println("samplesize: " + sampleSize + " received: " + transactions.size());
    FPgrowth.mineFrequentItemsets(transactions.iterator(), transactions.size(),
            minFreqPercent - (epsilon * 50), output);
    long endTime = System.currentTimeMillis();
    // Add the elapsed time to the end-time counter recorded above.
    reporter.incrCounter("FIMReducerEnd", String.valueOf(id), endTime - startTime);
}
From source file:InputSamplerMapper.java
License:Apache License
@Override
public void map(LongWritable key, Text value, OutputCollector<IntWritable, Text> output,
        Reporter reporter) throws IOException {
    reporter.incrCounter("FIMMapperStart", String.valueOf(id), System.currentTimeMillis());
    IntArrayWritable arr = (IntArrayWritable) map.get(key);
    if (arr != null) {
        for (Writable element : arr.get()) {
            output.collect((IntWritable) element, value);
        }
    }
    reporter.incrCounter("FIMMapperEnd", String.valueOf(id), System.currentTimeMillis());
}
From source file:colossal.pipe.ColHadoopMapper.java
License:Apache License
@SuppressWarnings("unchecked")
@Override
public void map(KEY wrapper, VALUE value, OutputCollector<KO, VO> collector, Reporter reporter)
        throws IOException {
    if (this.context == null) {
        KeyExtractor<GenericData.Record, OUT> extractor = new ReflectionKeyExtractor<OUT>(schema,
                groupBy, sortBy);
        this.context = new ColContext<OUT>(new Collector(collector, extractor), reporter);
    }
    if (isTextInput) {
        mapper.map((IN) value, out, context);
    } else if (isStringInput) {
        mapper.map((IN) ((Text) value).toString(), out, context);
    } else if (isJsonInput) {
        String json = ((Text) value).toString();
        if (shouldSkip(json))
            return;
        // Inefficient implementation of JSON to Avro; more efficient would be
        // JsonToClass.jsonToRecord:
        // mapper.map((IN) JsonToClass.jsonToRecord(json, inSchema), out, context);
        // Silly conversion approach: serialize, then deserialize.
        try {
            GenericContainer c = JsonToGenericRecord.jsonToRecord(json, inSchema);
            GenericDatumWriter<GenericContainer> writer =
                    new GenericDatumWriter<GenericContainer>(inSchema);
            ByteArrayOutputStream bos = new ByteArrayOutputStream();
            writer.setSchema(inSchema);
            writer.write(c, new BinaryEncoder(bos));
            byte[] data = bos.toByteArray();
            GenericDatumReader<IN> reader = new SpecificDatumReader<IN>(inSchema);
            reader.setSchema(inSchema);
            IN converted = reader.read(null,
                    DecoderFactory.defaultFactory().createBinaryDecoder(data, null));
            mapper.map(converted, out, context);
        } catch (JsonParseException jpe) {
            System.err.println("Failed to parse " + json + ": " + jpe.getMessage());
            reporter.incrCounter("ColHadoopMapper", "json-parse-error", 1L);
            if (++parseErrors > maxAllowedErrors) {
                throw new RuntimeException(jpe);
            }
        }
    } else {
        mapper.map(((AvroWrapper<IN>) wrapper).datum(), out, context);
    }
}
From source file:com.blackberry.logdriver.mapred.avro.AvroBlockWriterMapper.java
License:Apache License
@Override
public void map(AvroFileHeader key, BytesWritable value,
        OutputCollector<BytesWritable, NullWritable> output, Reporter reporter) throws IOException {
    byte[] valueBytes = null;
    if (header.getSyncMarker() == null) {
        LOG.info("Writing new header for new file: {}", key.toString());
        header.set(key);
        output.collect(new BytesWritable(header.toBytes()), null);
    } else {
        AvroFileHeader newHeader = key;
        if (!header.getSchema().equals(newHeader.getSchema())) {
            throw new IOException("Schemas in files do not match.");
        }
        if (!header.getCodec().equals(newHeader.getCodec())) {
            throw new IOException("Codecs in files do not match.");
        }
    }
    if (value.getLength() == 0) {
        return;
    }
    valueBytes = Arrays.copyOfRange(value.getBytes(), 0, value.getLength());
    output.collect(new BytesWritable(valueBytes), null);
    output.collect(new BytesWritable(header.getSyncMarker()), null);
    reporter.incrCounter("Avro Block", "Blocks processed", 1);
    // The extra 16 bytes account for the sync marker emitted after the block.
    reporter.incrCounter("Avro Block", "Bytes processed", value.getLength() + 16);
}
From source file:com.digitalpebble.behemoth.BehemothMapper.java
License:Apache License
public void map(Text key, BehemothDocument inputDoc, OutputCollector<Text, BehemothDocument> output,
        Reporter reporter) throws IOException {
    boolean keep = docFilter.keep(inputDoc);
    if (!keep) {
        reporter.incrCounter("BehemothMapper", "DOC SKIPPED BY FILTERS", 1);
        return;
    }
    output.collect(key, inputDoc);
}
From source file:com.digitalpebble.behemoth.BehemothReducer.java
License:Apache License
public void reduce(Text key, Iterator<BehemothDocument> doc,
        OutputCollector<Text, BehemothDocument> output, Reporter reporter) throws IOException {
    while (doc.hasNext()) {
        BehemothDocument inputDoc = doc.next();
        boolean keep = docFilter.keep(inputDoc);
        if (!keep) {
            reporter.incrCounter("BehemothReducer", "DOC SKIPPED BY FILTERS", 1);
            continue;
        }
        output.collect(key, inputDoc);
    }
}
From source file:com.digitalpebble.behemoth.ClassifierJob.java
License:Apache License
public void map(Text key, BehemothDocument doc, OutputCollector<Text, BehemothDocument> collector,
        Reporter reported) throws IOException {
    // Get the text.
    if (doc.getText() == null || doc.getText().length() < 2) {
        reported.incrCounter("text classification", "MISSING TEXT", 1);
        filterOrCollect(key, doc, collector, reported);
        return;
    }
    // Use the quick and dirty tokenization.
    String[] tokens = Tokenizer.tokenize(doc.getText(), lowerCase);
    // TODO use annotations instead?
    Document tcdoc = classifier.createDocument(tokens);
    double[] scores;
    try {
        scores = classifier.classify(tcdoc);
    } catch (Exception e) {
        LOG.error("Exception while classifying", e);
        filterOrCollect(key, doc, collector, reported);
        reported.incrCounter("text classification", "EXCEPTION", 1);
        return;
    }
    String label = classifier.getBestLabel(scores);
    doc.getMetadata(true).put(new Text(docFeaturename), new Text(label));
    filterOrCollect(key, doc, collector, reported);
    reported.incrCounter("text classification", label, 1);
}
From source file:com.digitalpebble.behemoth.ClassifierJob.java
License:Apache License
private void filterOrCollect(Text key, BehemothDocument doc,
        OutputCollector<Text, BehemothDocument> collector, Reporter reported) throws IOException {
    if (filter.keep(doc)) {
        collector.collect(key, doc);
    } else {
        reported.incrCounter("text classification", "FILTERED", 1L);
    }
}
From source file:com.digitalpebble.behemoth.gate.GATEProcessor.java
License:Apache License
public synchronized String processNative(BehemothDocument inputDoc, Reporter reporter) {
    if (reporter != null)
        reporter.setStatus("GATE : " + inputDoc.getUrl().toString());
    // Process the text passed as value with the application:
    // a) create a GATE document based on the text value
    gate.Document gatedocument = null;
    try {
        gatedocument = generateGATEDoc(inputDoc);
        // Add it to the current corpus.
        corpus.add(gatedocument);
        // Get the application and assign the corpus to it.
        this.GATEapplication.setCorpus(corpus);
        // Process it with GATE.
        this.GATEapplication.execute();
        // Transfer the annotations from the GATE document
        // to the Behemoth one using the filters.
        if (reporter != null)
            reporter.incrCounter("GATE", "Document", 1);
        return gatedocument.toXml();
    } catch (Exception e) {
        LOG.error(inputDoc.getUrl().toString(), e);
        if (reporter != null)
            reporter.incrCounter("GATE", "Exceptions", 1);
    } finally {
        // Remove the document from the corpus again...
        corpus.clear();
        // ...and from memory.
        if (gatedocument != null)
            Factory.deleteResource(gatedocument);
    }
    return null;
}