Usage examples for org.apache.commons.collections.buffer.PriorityBuffer#iterator()
public Iterator iterator()
From source file: co.cask.hydrator.plugin.batch.aggreagtor.aggregator.Sampling.java
@Override public void aggregate(String groupKey, Iterator<StructuredRecord> iterator, Emitter<StructuredRecord> emitter) throws Exception { int finalSampleSize = 0; if (config.sampleSize != null) { finalSampleSize = config.sampleSize; }// w w w . j av a 2 s . co m if (config.samplePercentage != null) { finalSampleSize = Math.round((config.samplePercentage / 100) * config.totalRecords); } switch (TYPE.valueOf(config.samplingType.toUpperCase())) { case SYSTEMATIC: if (config.overSamplingPercentage != null) { finalSampleSize = Math .round(finalSampleSize + (finalSampleSize * (config.overSamplingPercentage / 100))); } int sampleIndex = Math.round(config.totalRecords / finalSampleSize); Float random = new Float(0); if (config.random != null) { random = config.random; } else { random = new Random().nextFloat(); } int firstSampleIndex = Math.round(sampleIndex * random); List<StructuredRecord> records = IteratorUtils.toList(iterator); int counter = 0; emitter.emit(records.get(firstSampleIndex)); counter++; while (counter < finalSampleSize) { int index = firstSampleIndex + (counter * sampleIndex); emitter.emit(records.get(index - 1)); counter++; } break; case RESERVOIR: PriorityBuffer sampleData = new PriorityBuffer(true, new Comparator<StructuredRecord>() { @Override public int compare(StructuredRecord o1, StructuredRecord o2) { if ((float) o1.get("random") < (float) o2.get("random")) { return 1; } else if ((float) o1.get("random") > (float) o2.get("random")) { return -1; } else { return 0; } } }); int count = 0; Random randomValue = new Random(); List<StructuredRecord> recordArray = IteratorUtils.toList(iterator); Schema inputSchema = recordArray.get(0).getSchema(); Schema schemaWithRandomField = createSchemaWithRandomField(inputSchema); while (count < finalSampleSize) { StructuredRecord record = recordArray.get(0); sampleData.add(getSampledRecord(record, randomValue.nextFloat(), schemaWithRandomField)); count++; } while (count < recordArray.size()) { StructuredRecord 
structuredRecord = (StructuredRecord) sampleData.get(); Float randomFloat = randomValue.nextFloat(); if ((float) structuredRecord.get("random") < randomFloat) { sampleData.remove(); StructuredRecord record = recordArray.get(count); sampleData.add(getSampledRecord(record, randomFloat, structuredRecord.getSchema())); } count++; } Iterator<StructuredRecord> sampleDataIterator = sampleData.iterator(); while (sampleDataIterator.hasNext()) { StructuredRecord sampledRecord = sampleDataIterator.next(); StructuredRecord.Builder builder = StructuredRecord.builder(inputSchema); for (Schema.Field field : sampledRecord.getSchema().getFields()) { if (!field.getName().equalsIgnoreCase("random")) { builder.set(field.getName(), sampledRecord.get(field.getName())); } } emitter.emit(builder.build()); } break; } }