List of usage examples for org.apache.hadoop.fs.FileSystem#getDefaultBlockSize
public long getDefaultBlockSize(Path f)
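Before the project examples below, here is a minimal, self-contained sketch of a typical call. It is not taken from any of the listed sources; the path is an illustrative assumption.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class DefaultBlockSizeExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical location; replace with a real path on your cluster.
        Path path = new Path("/tmp/example.dat");
        FileSystem fs = path.getFileSystem(conf);
        // Returns the default block size the file system would use for a
        // file created at this path (e.g. 128 MB on a stock HDFS setup).
        long blockSize = fs.getDefaultBlockSize(path);
        System.out.println("Default block size: " + blockSize + " bytes");
    }
}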
From source file:edu.umn.cs.spatialHadoop.operations.SJMR.java
License:Open Source License
public static <S extends Shape> long sjmr(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, SJMR.class);
    LOG.info("SJMR journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("SJMR");
    job.setMapperClass(SJMRMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size",
            Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(), inFs.getFileStatus(inFiles[1]).getBlockSize()));
    job.setReducerClass(SJMRReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));
    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);
    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);
    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);
    TextOutputFormat.setOutputPath(job, outputPath);
    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();
    return resultCount;
}
From source file:edu.umn.cs.spatialHadoop.operations.Touches.java
License:Open Source License
public static <S extends Shape> long touches(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Touches.class);
    LOG.info("Touches journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Touches");
    job.setMapperClass(TouchesMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size",
            Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(), inFs.getFileStatus(inFiles[1]).getBlockSize()));
    job.setReducerClass(TouchesReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));
    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);
    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);
    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);
    TextOutputFormat.setOutputPath(job, outputPath);
    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();
    return resultCount;
}
From source file:edu.umn.cs.spatialHadoop.operations.Within.java
License:Open Source License
public static <S extends Shape> long within(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Within.class);
    LOG.info("Within journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Within");
    job.setMapperClass(WithinMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size",
            Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(), inFs.getFileStatus(inFiles[1]).getBlockSize()));
    job.setReducerClass(WithinReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));
    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);
    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);
    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);
    TextOutputFormat.setOutputPath(job, outputPath);
    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();
    return resultCount;
}
From source file:edu.umn.cs.spatialHadoop.RandomSpatialGenerator.java
License:Open Source License
private static void generateMapReduce(Path outFile, OperationsParams params) throws IOException {
    JobConf job = new JobConf(params, RandomSpatialGenerator.class);
    job.setJobName("Generator");
    Shape shape = params.getShape("shape");
    FileSystem outFs = outFile.getFileSystem(job);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();

    // Set input format and map class
    job.setInputFormat(RandomInputFormat.class);
    job.setMapperClass(Repartition.RepartitionMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(shape.getClass());
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    String sindex = params.get("sindex");
    Rectangle mbr = params.getShape("mbr").getMBR();

    CellInfo[] cells;
    if (sindex == null) {
        cells = new CellInfo[] { new CellInfo(1, mbr) };
    } else if (sindex.equals("grid")) {
        GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
        FileSystem fs = outFile.getFileSystem(job);
        long blocksize = fs.getDefaultBlockSize(outFile);
        long size = params.getSize("size");
        int numOfCells = Repartition.calculateNumberOfPartitions(job, size, fs, outFile, blocksize);
        gridInfo.calculateCellDimensions(numOfCells);
        cells = gridInfo.getAllCells();
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    SpatialSite.setCells(job, cells);

    // Do not set a reduce function. Use the default identity reduce function
    if (cells.length == 1) {
        // All objects are in one partition. No need for a reduce phase
        job.setNumReduceTasks(0);
    } else {
        // More than one partition. Need a reduce phase to group shapes of the
        // same partition together
        job.setReducerClass(RepartitionReduce.class);
        job.setNumReduceTasks(
                Math.max(1, Math.min(cells.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));
    }

    // Set output path
    FileOutputFormat.setOutputPath(job, outFile);
    if (sindex == null || sindex.equals("grid")) {
        job.setOutputFormat(GridOutputFormat.class);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    JobClient.runJob(job);

    // TODO move the following part to OutputCommitter
    // Concatenate all master files into one file
    FileStatus[] resultFiles = outFs.listStatus(outFile, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().contains("_master");
        }
    });
    String ext = resultFiles[0].getPath().getName()
            .substring(resultFiles[0].getPath().getName().lastIndexOf('.'));
    Path masterPath = new Path(outFile, "_master" + ext);
    OutputStream destOut = outFs.create(masterPath);
    byte[] buffer = new byte[4096];
    for (FileStatus f : resultFiles) {
        InputStream in = outFs.open(f.getPath());
        int bytes_read;
        do {
            bytes_read = in.read(buffer);
            if (bytes_read > 0)
                destOut.write(buffer, 0, bytes_read);
        } while (bytes_read > 0);
        in.close();
        outFs.delete(f.getPath(), false);
    }
    destOut.close();
}
From source file:edu.umn.cs.spatialHadoop.RandomSpatialGenerator.java
License:Open Source License
/**
 * Generates random rectangles and writes the result to a file.
 * @param outFS - The file system that contains the output file
 * @param outputFile - The file name to write to. If either outFS or
 *        outputFile is null, data is generated to the standard output
 * @param mbr - The whole MBR to generate in
 * @param shape
 * @param totalSize - The total size of the generated file
 * @param blocksize
 * @throws IOException
 */
private static void generateFileLocal(Path outFile, OperationsParams params) throws IOException {
    JobConf job = new JobConf(params, RandomSpatialGenerator.class);
    FileSystem outFS = outFile.getFileSystem(params);
    long blocksize = outFS.getDefaultBlockSize(outFile);
    String sindex = params.get("sindex");
    Rectangle mbr = params.getShape("mbr").getMBR();
    long totalSize = params.getSize("size");

    // Calculate the dimensions of each partition based on gindex type
    CellInfo[] cells;
    if (sindex == null) {
        cells = new CellInfo[] { new CellInfo(1, mbr) };
    } else if (sindex.equals("grid")) {
        int num_partitions = Repartition.calculateNumberOfPartitions(params, totalSize, outFS, outFile,
                blocksize);
        GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
        gridInfo.calculateCellDimensions(num_partitions);
        cells = gridInfo.getAllCells();
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    outFS.mkdirs(outFile);

    ShapeRecordWriter<Shape> writer;
    if (sindex == null || sindex.equals("grid")) {
        writer = new GridRecordWriter<Shape>(outFile, job, null, cells);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    int rectSize = params.getInt("rectsize", 100);
    long seed = params.getLong("seed", System.currentTimeMillis());
    float circleThickness = params.getFloat("thickness", 1);
    DistributionType type = SpatialSite.getDistributionType(params, "type", DistributionType.UNIFORM);
    Shape shape = params.getShape("shape");
    long t1 = System.currentTimeMillis();

    RandomShapeGenerator<Shape> generator = new RandomShapeGenerator<Shape>(totalSize, mbr, type, rectSize,
            seed, circleThickness);
    Rectangle key = generator.createKey();

    while (generator.next(key, shape)) {
        // Serialize it to text
        writer.write(NullWritable.get(), shape);
    }
    writer.close(null);
    long t2 = System.currentTimeMillis();
    System.out.println("Generation time: " + (t2 - t1) + " millis");
}
From source file:edu.umn.cs.spatialHadoop.visualization.SingleLevelPlot.java
License:Open Source License
/**
 * Generates a single level using a MapReduce job and returns the created job.
 * @param inFiles
 * @param outFile
 * @param plotterClass
 * @param params
 * @return
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static Job plotMapReduce(Path[] inFiles, Path outFile, Class<? extends Plotter> plotterClass,
        OperationsParams params) throws IOException, InterruptedException, ClassNotFoundException {
    Plotter plotter;
    try {
        plotter = plotterClass.newInstance();
    } catch (InstantiationException e) {
        throw new RuntimeException("Error creating rasterizer", e);
    } catch (IllegalAccessException e) {
        throw new RuntimeException("Error creating rasterizer", e);
    }

    Job job = new Job(params, "SingleLevelPlot");
    job.setJarByClass(SingleLevelPlot.class);
    job.setJobName("SingleLevelPlot");
    // Set plotter
    Configuration conf = job.getConfiguration();
    Plotter.setPlotter(conf, plotterClass);
    // Set input file MBR
    Rectangle inputMBR = (Rectangle) params.getShape("mbr");
    Rectangle drawRect = (Rectangle) params.getShape("rect");
    if (inputMBR == null)
        inputMBR = drawRect != null ? drawRect : FileMBR.fileMBR(inFiles, params);
    OperationsParams.setShape(conf, InputMBR, inputMBR);
    if (drawRect != null)
        OperationsParams.setShape(conf, SpatialInputFormat3.InputQueryRange, drawRect);

    // Adjust width and height if aspect ratio is to be kept
    int imageWidth = conf.getInt("width", 1000);
    int imageHeight = conf.getInt("height", 1000);
    if (params.getBoolean("keepratio", true)) {
        // Adjust width and height to maintain aspect ratio
        if (inputMBR.getWidth() / inputMBR.getHeight() > (double) imageWidth / imageHeight) {
            // Fix width and change height
            imageHeight = (int) (inputMBR.getHeight() * imageWidth / inputMBR.getWidth());
            // Make divisible by two for compatibility with ffmpeg
            if (imageHeight % 2 == 1)
                imageHeight--;
            conf.setInt("height", imageHeight);
        } else {
            imageWidth = (int) (inputMBR.getWidth() * imageHeight / inputMBR.getHeight());
            conf.setInt("width", imageWidth);
        }
    }

    boolean merge = conf.getBoolean("merge", true);
    // Set input and output
    job.setInputFormatClass(SpatialInputFormat3.class);
    SpatialInputFormat3.setInputPaths(job, inFiles);
    if (conf.getBoolean("output", true)) {
        if (merge) {
            job.setOutputFormatClass(CanvasOutputFormat.class);
            conf.setClass("mapred.output.committer.class", CanvasOutputFormat.ImageWriterOld.class,
                    org.apache.hadoop.mapred.OutputCommitter.class);
        } else {
            job.setOutputFormatClass(ImageOutputFormat.class);
        }
        CanvasOutputFormat.setOutputPath(job, outFile);
    } else {
        job.setOutputFormatClass(NullOutputFormat.class);
    }

    // Set mapper and reducer based on the partitioning scheme
    String partition = conf.get("partition", "none");
    ClusterStatus clusterStatus = new JobClient(new JobConf()).getClusterStatus();
    if (partition.equalsIgnoreCase("none")) {
        LOG.info("Using no-partition plot");
        job.setMapperClass(NoPartitionPlotMap.class);
        job.setCombinerClass(NoPartitionPlotCombine.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(plotter.getCanvasClass());
        if (merge) {
            int numSplits = new SpatialInputFormat3().getSplits(job).size();
            job.setReducerClass(NoPartitionPlotReduce.class);
            // Set number of reduce tasks according to cluster status
            int maxReduce = Math.max(1, clusterStatus.getMaxReduceTasks() * 7 / 8);
            job.setNumReduceTasks(Math.max(1, Math.min(maxReduce, numSplits / maxReduce)));
        } else {
            job.setNumReduceTasks(0);
        }
    } else {
        LOG.info("Using repartition plot");
        Partitioner partitioner;
        if (partition.equals("pixel")) {
            // Special case for pixel level partitioning as it depends on the
            // visualization parameters
            partitioner = new GridPartitioner(inputMBR, imageWidth, imageHeight);
        } else if (partition.equals("grid")) {
            int numBlocks = 0;
            for (Path in : inFiles) {
                FileSystem fs = in.getFileSystem(params);
                long size = FileUtil.getPathSize(fs, in);
                long blockSize = fs.getDefaultBlockSize(in);
                numBlocks += Math.ceil(size / (double) blockSize);
            }
            int numPartitions = numBlocks * 1000;
            int gridSize = (int) Math.ceil(Math.sqrt(numPartitions));
            partitioner = new GridPartitioner(inputMBR, gridSize, gridSize);
        } else {
            // Use a standard partitioner as created by the indexer
            partitioner = Indexer.createPartitioner(inFiles, outFile, conf, partition);
        }
        Shape shape = params.getShape("shape");
        job.setMapperClass(RepartitionPlotMap.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(shape.getClass());
        job.setReducerClass(RepartitionPlotReduce.class);
        // Set number of reducers according to cluster size
        job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks() * 9 / 10));
        Partitioner.setPartitioner(conf, partitioner);
    }

    // Use multithreading in case the job is running locally
    conf.setInt(LocalJobRunner.LOCAL_MAX_MAPS, Runtime.getRuntime().availableProcessors());

    // Start the job
    if (params.getBoolean("background", false)) {
        // Run in background
        job.submit();
    } else {
        job.waitForCompletion(params.getBoolean("verbose", false));
    }
    return job;
}
From source file:info.halo9pan.word2vec.hadoop.mr.SortInputFormat.java
License:Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 *
 * @param job
 *            the job to sample
 * @param partFile
 *            where to write the output file to
 * @throws Throwable
 *             if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final SortInputFormat inFormat = new SortInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {
                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing partitions took " + (t3 - t2) + "ms");
}
From source file:org.apache.gobblin.data.management.copy.CopyableFile.java
License:Apache License
/**
 * @return desired block size for destination file.
 */
public long getBlockSize(FileSystem targetFs) {
    return getPreserve().preserve(PreserveAttributes.Option.BLOCK_SIZE) ? getOrigin().getBlockSize()
            : targetFs.getDefaultBlockSize(this.destination);
}
From source file:org.apache.hadoop.examples.terasort.TeraInputFormat.java
License:Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws Throwable if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(TeraSortConfigKeys.SAMPLE_SIZE.key(),
            TeraSortConfigKeys.DEFAULT_SAMPLE_SIZE);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(
            conf.getInt(TeraSortConfigKeys.NUM_PARTITIONS.key(), TeraSortConfigKeys.DEFAULT_NUM_PARTITIONS),
            splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {
                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing partitions took " + (t3 - t2) + "ms");
}
From source file:org.apache.nifi.processors.hadoop.AbstractHadoopProcessor.java
License:Apache License
HdfsResources resetHDFSResources(String configResources, ProcessContext context) throws IOException {
    Configuration config = new ExtendedConfiguration(getLogger());
    config.setClassLoader(Thread.currentThread().getContextClassLoader());

    getConfigurationFromResources(config, configResources);

    // give sub-classes a chance to process configuration
    preProcessConfiguration(config, context);

    // first check for timeout on HDFS connection, because FileSystem has a hard-coded 15-minute timeout
    checkHdfsUriForTimeout(config);

    // disable caching of Configuration and FileSystem objects, else we cannot reconfigure
    // the processor without a complete restart
    String disableCacheName = String.format("fs.%s.impl.disable.cache",
            FileSystem.getDefaultUri(config).getScheme());
    config.set(disableCacheName, "true");

    // If kerberos is enabled, create the file system as the kerberos principal
    // -- use RESOURCES_LOCK to guarantee UserGroupInformation is accessed by only a single thread at a time
    FileSystem fs;
    UserGroupInformation ugi;
    synchronized (RESOURCES_LOCK) {
        if (SecurityUtil.isSecurityEnabled(config)) {
            String principal = context.getProperty(kerberosProperties.getKerberosPrincipal())
                    .evaluateAttributeExpressions().getValue();
            String keyTab = context.getProperty(kerberosProperties.getKerberosKeytab())
                    .evaluateAttributeExpressions().getValue();
            ugi = SecurityUtil.loginKerberos(config, principal, keyTab);
            fs = getFileSystemAsUser(config, ugi);
        } else {
            config.set("ipc.client.fallback-to-simple-auth-allowed", "true");
            config.set("hadoop.security.authentication", "simple");
            ugi = SecurityUtil.loginSimple(config);
            fs = getFileSystemAsUser(config, ugi);
        }
    }
    getLogger().debug("resetHDFSResources UGI {}", new Object[] { ugi });

    final Path workingDir = fs.getWorkingDirectory();
    getLogger().info(
            "Initialized a new HDFS File System with working dir: {} default block size: {} default replication: {} config: {}",
            new Object[] { workingDir, fs.getDefaultBlockSize(workingDir), fs.getDefaultReplication(workingDir),
                    config.toString() });

    return new HdfsResources(config, fs, ugi);
}