Java tutorial: TupleSampler (com.splout.db.hadoop), from the Splout SQL Hadoop library. The full class listing follows; a few short illustrative sketches appear after the listing.
package com.splout.db.hadoop;

/*
 * #%L
 * Splout SQL Hadoop library
 * %%
 * Copyright (C) 2012 Datasalt Systems S.L.
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.tuplemr.MapOnlyJobBuilder;
import com.datasalt.pangool.tuplemr.MultipleOutputsCollector;
import com.datasalt.pangool.tuplemr.mapred.MapOnlyMapper;
import com.datasalt.pangool.tuplemr.mapred.lib.output.HadoopOutputFormat;
import com.datasalt.pangool.utils.TaskAttemptContextFactory;
import com.splout.db.common.PartitionMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.mockito.Mockito;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * This class samples a list of {@link TableInput} files that produce a certain Table Schema. There are two sampling
 * methods supported:
 * <ul>
 * <li>FULL_SCAN: It uses a Map-Only Job for performing Reservoir Sampling over the whole dataset.</li>
 * <li>RANDOM: Inspired by Hadoop's TeraInputFormat. A Hadoop Job is not needed. Consecutive records are read from
 * each InputSplit.</li>
 * </ul>
 * Sampling can be used by {@link TablespaceGenerator} for determining a {@link PartitionMap} based on the
 * approximated distribution of the keys.
 */
@SuppressWarnings({ "serial", "rawtypes" })
public class TupleSampler implements Serializable {

  private final static Log logger = LogFactory.getLog(TupleSampler.class);

  private final SamplingType samplingType;
  private final SamplingOptions options;
  private Class callingClass;

  public enum SamplingType {
    FULL_SCAN, RANDOM
  }

  public static class TupleSamplerException extends Exception {

    public TupleSamplerException(String reason) {
      super(reason);
    }

    public TupleSamplerException(Exception e) {
      super(e);
    }

    public TupleSamplerException(String message, Throwable cause) {
      super(message, cause);
    }
  }

  // Each sampling algorithm may have its own options but there are some which are common to both
  public static abstract class SamplingOptions extends HashMap<String, Object> {

    public Long getMaxInputSplitSize() {
      return (Long) this.get("maxInputSplitSize");
    }

    public void setMaxInputSplitSize(Long maxInputSplitSize) {
      this.put("maxInputSplitSize", maxInputSplitSize);
    }
  }

  // Options for RANDOM sampling
  public static class RandomSamplingOptions extends SamplingOptions {

    public RandomSamplingOptions() {
      super();
      setMaxSplitsToVisit(1000);
    }

    public int getMaxSplitsToVisit() {
      return (Integer) this.get("maxSplitsToVisit");
    }

    public void setMaxSplitsToVisit(int maxSplitsToVisit) {
      this.put("maxSplitsToVisit", maxSplitsToVisit);
    }
  }

  // Options for FULL_SCAN sampling
  public static class FullScanSamplingOptions extends SamplingOptions {
  }

  public TupleSampler(SamplingType samplingType, SamplingOptions options, Class callingClass) {
    this.samplingType = samplingType;
    this.options = options;
    this.callingClass = callingClass;
  }

  public long sample(TablespaceSpec tablespace, Configuration hadoopConf, long sampleSize, Path outFile)
      throws TupleSamplerException {
    // 1 - Determine Input Splits
    // 2 - Launch sampling with the selected method
    // 3 - Recover the results
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Map<InputSplit, InputFormat<ITuple, NullWritable>> splitToFormat = new HashMap<InputSplit, InputFormat<ITuple, NullWritable>>();
    Map<InputSplit, RecordProcessor> recordProcessorPerSplit = new HashMap<InputSplit, RecordProcessor>();
    Map<InputSplit, Map<String, String>> specificHadoopConfMap = new HashMap<InputSplit, Map<String, String>>();
    Map<InputSplit, TableSpec> splitToTableSpec = new HashMap<InputSplit, TableSpec>();
    Map<InputSplit, JavascriptEngine> splitToJsEngine = new HashMap<InputSplit, JavascriptEngine>();

    try {
      for (Table table : tablespace.getPartitionedTables()) {
        // Initialize JavaScript engine if needed
        JavascriptEngine jsEngine = null;
        TableSpec tableSpec = table.getTableSpec();
        if (tableSpec.getPartitionByJavaScript() != null) {
          try {
            jsEngine = new JavascriptEngine(tableSpec.getPartitionByJavaScript());
          } catch (Throwable e) {
            throw new RuntimeException(e);
          }
        }
        for (TableInput tableFile : table.getFiles()) {
          @SuppressWarnings("deprecation")
          Job job = new Job(hadoopConf);
          FileInputFormat.setInputPaths(job, tableFile.getPaths());
          if (options.getMaxInputSplitSize() != null) {
            logger.info("Using max input split size: " + options.getMaxInputSplitSize());
            FileInputFormat.setMaxInputSplitSize(job, options.getMaxInputSplitSize());
          }
          job.setInputFormatClass(FileInputFormat.class);
          if (tableFile.getSpecificHadoopInputFormatContext() != null) {
            for (Map.Entry<String, String> specificHadoopConf : tableFile
                .getSpecificHadoopInputFormatContext().entrySet()) {
              job.getConfiguration().set(specificHadoopConf.getKey(), specificHadoopConf.getValue());
            }
          }
          for (InputSplit split : tableFile.getFormat().getSplits(job)) {
            if (tableFile.getSpecificHadoopInputFormatContext() != null) {
              specificHadoopConfMap.put(split, tableFile.getSpecificHadoopInputFormatContext());
            }
            splitToFormat.put(split, tableFile.getFormat());
            recordProcessorPerSplit.put(split, tableFile.getRecordProcessor());
            splitToTableSpec.put(split, tableSpec);
            splitToJsEngine.put(split, jsEngine);
            splits.add(split);
          }
        }
      }

      long retrievedSamples;
      if (samplingType.equals(SamplingType.RANDOM)) {
        try {
          RandomSamplingOptions defOptions = (RandomSamplingOptions) options;
          // Default sampling method
          retrievedSamples = randomSampling(sampleSize, hadoopConf, outFile, splits, splitToTableSpec,
              splitToFormat, specificHadoopConfMap, recordProcessorPerSplit, splitToJsEngine,
              defOptions.getMaxSplitsToVisit());
        } catch (ClassCastException ef) {
          throw new RuntimeException("Invalid options class: " + options.getClass() + " Expected:"
              + RandomSamplingOptions.class);
        }
      } else {
        // Reservoir sampling over the full data
        retrievedSamples = fullScanSampling(tablespace, sampleSize, hadoopConf, outFile, splits.size());
      }
      return retrievedSamples;
    } catch (IOException e) {
      throw new TupleSamplerException(e);
    } catch (InterruptedException e) {
      throw new TupleSamplerException(e);
    }
  }

  /*
   * Reservoir sampling that scans the full dataset to get the samples that are used to calculate the partition map.
   * Based on http://en.wikipedia.org/wiki/Reservoir_sampling.
   *
   * Writes a SequenceFile with Text, NullWritable. The key contains the strings to be used for the partitioning.
   *
   * @return The number of samples retrieved
   */
  @SuppressWarnings("deprecation")
  private long fullScanSampling(TablespaceSpec tablespace, final long sampleSize, Configuration hadoopConf,
                                Path outputPath, final int nSplits) throws TupleSamplerException {

    MapOnlyJobBuilder builder = new MapOnlyJobBuilder(hadoopConf, "Reservoir Sampling to path " + outputPath);

    for (Table table : tablespace.getPartitionedTables()) {
      final TableSpec tableSpec = table.getTableSpec();
      final String getPartitionByJavaScript = tableSpec.getPartitionByJavaScript();
      for (TableInput inputFile : table.getFiles()) {
        final RecordProcessor processor = inputFile.getRecordProcessor();
        for (Path path : inputFile.getPaths()) {
          builder.addInput(path, inputFile.getFormat(),
              new MapOnlyMapper<ITuple, NullWritable, Text, NullWritable>() {

                final int nSamples = (int) (sampleSize / nSplits);
                final String[] samples = new String[nSamples];

                CounterInterface counterInterface;
                long recordCounter = 0;

                JavascriptEngine jsEngine = null;

                @Override
                protected void setup(Context context, MultipleOutputsCollector coll) throws IOException,
                    InterruptedException {
                  counterInterface = new CounterInterface(context);
                  // Initialize JavaScript engine if needed
                  if (getPartitionByJavaScript != null) {
                    try {
                      jsEngine = new JavascriptEngine(getPartitionByJavaScript);
                    } catch (Throwable e) {
                      throw new RuntimeException(e);
                    }
                  }
                }

                // Collect Tuples with decreasing probability
                // (http://en.wikipedia.org/wiki/Reservoir_sampling)
                protected void map(ITuple key, NullWritable value, Context context) throws IOException,
                    InterruptedException {
                  ITuple uTuple;
                  try {
                    uTuple = processor.process(key, key.getSchema().getName(), counterInterface);
                  } catch (Throwable e) {
                    throw new RuntimeException(e);
                  }
                  if (uTuple == null) { // user may have filtered the record
                    return;
                  }

                  long reservoirIndex;
                  if (recordCounter < nSamples) {
                    reservoirIndex = recordCounter;
                  } else {
                    reservoirIndex = (long) (Math.random() * recordCounter);
                  }

                  if (reservoirIndex < nSamples) {
                    String pkey = null;
                    try {
                      pkey = TablespaceGenerator.getPartitionByKey(uTuple, tableSpec, jsEngine);
                    } catch (Throwable e) {
                      throw new RuntimeException("Error when determining partition key.", e);
                    }
                    samples[(int) reservoirIndex] = pkey;
                  }

                  recordCounter++;
                }

                // Write the in-memory sampled Tuples
                protected void cleanup(Context context, MultipleOutputsCollector coll) throws IOException,
                    InterruptedException {
                  Text key = new Text();
                  for (String keyStr : samples) {
                    if (keyStr != null) {
                      key.set(keyStr);
                      context.write(key, NullWritable.get());
                    }
                  }
                }
              }, inputFile.getSpecificHadoopInputFormatContext());
        }
      }
    }

    // Set output path
    Path outReservoirPath = new Path(outputPath + "-reservoir");
    builder.setOutput(outReservoirPath, new HadoopOutputFormat(SequenceFileOutputFormat.class), Text.class,
        NullWritable.class);
    builder.setJarByClass(callingClass);

    try {
      Job job = builder.createJob();
      if (!job.waitForCompletion(true)) {
        throw new TupleSamplerException("Reservoir Sampling failed!");
      }
    } catch (Exception e) {
      throw new TupleSamplerException("Error creating or launching the sampling job.", e);
    } finally {
      try {
        builder.cleanUpInstanceFiles();
      } catch (IOException e) {
        throw new TupleSamplerException("Error cleaning up the sampling job.", e);
      }
    }

    long retrievedSamples = 0;
    try {
      FileSystem outFs = outReservoirPath.getFileSystem(hadoopConf);
      if (outFs.listStatus(outReservoirPath) == null) {
        throw new IOException("Output folder not created: the Job failed!");
      }

      // Instantiate the writer we will write samples to
      SequenceFile.Writer writer = new SequenceFile.Writer(outFs, hadoopConf, outputPath, Text.class,
          NullWritable.class);

      // Aggregate the output into a single file for being consistent with the other sampling methods
      for (FileStatus fileStatus : outFs.listStatus(outReservoirPath)) {
        Path thisPath = fileStatus.getPath();
        if (thisPath.getName().startsWith("part-m-")) {
          SequenceFile.Reader reader = new SequenceFile.Reader(outFs, thisPath, hadoopConf);
          Text key = new Text();
          while (reader.next(key)) {
            writer.append(key, NullWritable.get());
            retrievedSamples++;
          }
          reader.close();
        }
      }

      writer.close();
      outFs.delete(outReservoirPath, true);
    } catch (IOException e) {
      throw new TupleSamplerException("Error consolidating the sample job results into one file.", e);
    }

    return retrievedSamples;
  }

  /**
   * Random sampling method a-la-TeraSort, getting some consecutive samples from each InputSplit
   * without using a Job.
   * The output is a SequenceFile with keys.
   *
   * @return The number of retrieved samples
   */
  private long randomSampling(long sampleSize, Configuration hadoopConf, Path outFile, List<InputSplit> splits,
                              Map<InputSplit, TableSpec> splitToTableSpec,
                              Map<InputSplit, InputFormat<ITuple, NullWritable>> splitToFormat,
                              Map<InputSplit, Map<String, String>> specificHadoopConf,
                              Map<InputSplit, RecordProcessor> recordProcessorPerSplit,
                              Map<InputSplit, JavascriptEngine> splitToJsEngine,
                              int maxSplitsToVisit) throws IOException {

    // Instantiate the writer we will write samples to
    FileSystem fs = FileSystem.get(outFile.toUri(), hadoopConf);

    if (splits.size() == 0) {
      throw new IllegalArgumentException("There are no splits to sample from!");
    }

    @SuppressWarnings("deprecation")
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, hadoopConf, outFile, Text.class, NullWritable.class);

    logger.info("Sequential sampling options, max splits to visit: " + maxSplitsToVisit + ", samples to take: "
        + sampleSize + ", total number of splits: " + splits.size());
    int blocks = Math.min(maxSplitsToVisit, splits.size());
    blocks = Math.min((int) sampleSize, blocks);
    long recordsPerSample = sampleSize / blocks;
    int sampleStep = splits.size() / blocks;

    long records = 0;

    CounterInterface counterInterface = new CounterInterface(null) {

      public Counter getCounter(String group, String name) {
        return Mockito.mock(Counter.class);
      }
    };

    // Take N samples from different parts of the input
    for (int i = 0; i < blocks; ++i) {
      TaskAttemptID attemptId = new TaskAttemptID(new TaskID(), 1);

      TaskAttemptContext attemptContext = null;
      try {
        attemptContext = TaskAttemptContextFactory.get(hadoopConf, attemptId);
      } catch (Exception e) {
        throw new RuntimeException(e);
      }

      InputSplit split = splits.get(sampleStep * i);
      if (specificHadoopConf.get(split) != null) {
        for (Map.Entry<String, String> specificConf : specificHadoopConf.get(split).entrySet()) {
          attemptContext.getConfiguration().set(specificConf.getKey(), specificConf.getValue());
        }
      }
      logger.info("Sampling split: " + split);
      RecordReader<ITuple, NullWritable> reader = null;
      try {
        reader = splitToFormat.get(split).createRecordReader(split, attemptContext);
        reader.initialize(split, attemptContext);

        RecordProcessor processor = recordProcessorPerSplit.get(split);
        Text key = new Text();
        while (reader.nextKeyValue()) {
          ITuple tuple = reader.getCurrentKey();

          ITuple uTuple;
          try {
            uTuple = processor.process(tuple, tuple.getSchema().getName(), counterInterface);
          } catch (Throwable e) {
            throw new RuntimeException(e);
          }
          if (uTuple != null) { // user may have filtered the record
            try {
              key.set(TablespaceGenerator.getPartitionByKey(uTuple, splitToTableSpec.get(split),
                  splitToJsEngine.get(split)));
            } catch (Throwable e) {
              throw new RuntimeException("Error when determining partition key.", e);
            }

            writer.append(key, NullWritable.get());
            records += 1;
            if ((i + 1) * recordsPerSample <= records) {
              break;
            }
          }
        }
      } catch (InterruptedException e) {
        throw new RuntimeException(e);
      }
    }

    writer.close();
    return records;
  }
}
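The class javadoc above describes the two sampling modes but the listing does not show how the class is driven. Below is a minimal, hedged usage sketch: only the TupleSampler constructor and sample(...) signatures visible in the listing are assumed, the sketch is placed in the same package to sidestep visibility questions, and buildTablespaceSpec() is a hypothetical placeholder, not part of the library.

package com.splout.db.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class TupleSamplerUsageSketch {

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    // Hypothetical placeholder: build the TablespaceSpec (tables, partitioning, input files)
    // however your application normally does; nothing here assumes a particular builder API.
    TablespaceSpec tablespace = buildTablespaceSpec();

    // RANDOM sampling reads a few consecutive records from evenly spaced splits, without a Hadoop Job.
    TupleSampler.RandomSamplingOptions options = new TupleSampler.RandomSamplingOptions();
    options.setMaxSplitsToVisit(500);                 // visit at most 500 input splits
    options.setMaxInputSplitSize(64L * 1024 * 1024);  // optional: cap input split size at 64 MB

    TupleSampler sampler = new TupleSampler(TupleSampler.SamplingType.RANDOM, options,
        TupleSamplerUsageSketch.class);

    // Writes a SequenceFile<Text, NullWritable> of sampled partition keys to the given path.
    long nSamples = sampler.sample(tablespace, conf, 10000, new Path("/tmp/sampled-keys"));
    System.out.println("Retrieved " + nSamples + " samples");
  }

  private static TablespaceSpec buildTablespaceSpec() {
    // Placeholder only: construct the TablespaceSpec as your application requires.
    throw new UnsupportedOperationException("build your TablespaceSpec here");
  }
}

The FULL_SCAN mode is selected the same way, passing SamplingType.FULL_SCAN and a FullScanSamplingOptions instance instead; it launches a Map-Only job, so it is slower but sees the whole dataset.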
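The FULL_SCAN path relies on reservoir sampling: the first nSamples keys fill the reservoir, and each later key replaces a random slot with probability nSamples / recordCounter. The self-contained sketch below (plain Java, no Hadoop, hypothetical input data) mirrors the index arithmetic from the anonymous mapper's map() method so the decreasing replacement probability can be checked in isolation.

import java.util.ArrayList;
import java.util.List;

/** Minimal, Hadoop-free illustration of the reservoir-sampling step used by fullScanSampling(). */
public class ReservoirSketch {

  public static void main(String[] args) {
    int nSamples = 10;                        // reservoir size (sampleSize / nSplits in the mapper)
    String[] samples = new String[nSamples];  // the in-memory reservoir
    long recordCounter = 0;

    // Simulate a stream of 1,000 partition keys.
    List<String> stream = new ArrayList<String>();
    for (int i = 0; i < 1000; i++) {
      stream.add("key-" + i);
    }

    for (String pkey : stream) {
      long reservoirIndex;
      if (recordCounter < nSamples) {
        // Fill phase: the first nSamples records always enter the reservoir.
        reservoirIndex = recordCounter;
      } else {
        // Replacement phase: pick a random index in [0, recordCounter);
        // it falls inside the reservoir with probability nSamples / recordCounter.
        reservoirIndex = (long) (Math.random() * recordCounter);
      }
      if (reservoirIndex < nSamples) {
        samples[(int) reservoirIndex] = pkey;
      }
      recordCounter++;
    }

    // Print the sampled keys; each input key ends up here with roughly equal probability.
    for (String s : samples) {
      System.out.println(s);
    }
  }
}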
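For the RANDOM path, how many splits are visited, how far apart they are, and how many records are taken from each all come from a few integer divisions at the top of randomSampling(). The small sketch below reproduces that arithmetic with example numbers (the sample values are made up) so the spread of samples over the input is easy to reason about.

/** Reproduces the block/step arithmetic at the start of randomSampling(). */
public class RandomSamplingMathSketch {

  public static void main(String[] args) {
    long sampleSize = 10000;     // total samples requested (example value)
    int maxSplitsToVisit = 1000; // default from RandomSamplingOptions
    int totalSplits = 240;       // splits.size() in the real code (example value)

    int blocks = Math.min(maxSplitsToVisit, totalSplits);
    blocks = Math.min((int) sampleSize, blocks);
    long recordsPerSample = sampleSize / blocks;  // target consecutive records per visited split
    int sampleStep = totalSplits / blocks;        // distance between visited splits

    System.out.println("splits visited:      " + blocks);
    System.out.println("records per split:   " + recordsPerSample);
    System.out.println("step between splits: " + sampleStep);
    // The i-th visited split is splits.get(sampleStep * i), for i in [0, blocks).
  }
}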