List of usage examples for org.apache.hadoop.fs.Path#getFileSystem
public FileSystem getFileSystem(Configuration conf) throws IOException
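Before the project examples below, here is a minimal sketch of the basic call pattern: resolve the FileSystem that owns a Path from a Configuration, then perform I/O through it. The path "/tmp/example.txt" and the class name GetFileSystemExample are placeholders chosen for illustration, not taken from any of the source files listed here.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class GetFileSystemExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();   // picks up core-site.xml etc. from the classpath
        Path path = new Path("/tmp/example.txt");   // placeholder path for illustration
        FileSystem fs = path.getFileSystem(conf);   // resolves the FileSystem for this path's scheme
        FSDataInputStream in = null;
        try {
            in = fs.open(path);                     // read through the resolved file system
            IOUtils.copyBytes(in, System.out, 4096, false);
        } finally {
            IOUtils.closeStream(in);
        }
    }
}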
From source file:com.alectenharmsel.research.WholeBlockRecordReader.java
License:Apache License
public boolean nextKeyValue() throws IOException, InterruptedException {
    if (!processed) {
        System.err.println("start is " + start);
        Path file = fileSplit.getPath();
        String tmp = file.toString();
        System.err.println("File: " + tmp);
        currKey.set(tmp);
        System.err.println("Reached this point");
        FileSystem fs = file.getFileSystem(conf);
        System.err.println("fs blocksize: " + fs.getDefaultBlockSize(file));
        System.err.println("linecount blocksize: " + blockSize);
        byte[] contents;
        FSDataInputStream in = null;
        try {
            in = fs.open(file);
            System.err.println("getPos(): " + in.getPos());
            if ((start + blockSize) > fileLength) {
                blockSize = (int) (fileLength - start);
                processed = true;
            }
            contents = new byte[blockSize];
            //IOUtils.readFully(in, contents, start, blockSize);
            //IOUtils.readFully(in, contents, 0, blockSize);
            in.readFully(start, contents);
            start += blockSize;
            currValue.set(contents);
        } finally {
            IOUtils.closeStream(in);
        }
        return true;
    }
    return false;
}
From source file:com.alexholmes.hadooputils.combine.avro.mapred.CombineAvroInputFormatTest.java
License:Apache License
@SuppressWarnings("deprecation") public void testJob() throws Exception { JobConf job = new JobConf(); String dir = System.getProperty("test.dir", ".") + "/mapred"; Path outputPath = new Path(dir + "/out"); outputPath.getFileSystem(job).delete(outputPath); WordCountUtil.writeLinesFile();/* w w w. ja v a2 s. com*/ job.setJobName("wordcount"); AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputSchema(job, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema()); AvroJob.setMapperClass(job, MapImpl.class); AvroJob.setCombinerClass(job, ReduceImpl.class); AvroJob.setReducerClass(job, ReduceImpl.class); job.setInputFormat(CombineAvroInputFormat.class); FileInputFormat.setInputPaths(job, new Path(dir + "/in")); FileOutputFormat.setOutputPath(job, outputPath); FileOutputFormat.setCompressOutput(job, true); WordCountUtil.setMeta(job); JobClient.runJob(job); WordCountUtil.validateCountsFile(); }
From source file:com.alexholmes.hadooputils.combine.seqfile.mapreduce.CombineSequenceFileInputFormat.java
License:Apache License
@Override
@SuppressWarnings("unchecked")
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> files = super.listStatus(job);

    int len = files.size();
    for (int i = 0; i < len; ++i) {
        FileStatus file = files.get(i);
        if (file.isDir()) { // it's a MapFile
            Path p = file.getPath();
            FileSystem fs = p.getFileSystem(job.getConfiguration());
            // use the data file
            files.set(i, fs.getFileStatus(new Path(p, MapFile.DATA_FILE_NAME)));
        }
    }
    return files;
}
From source file:com.alexholmes.hadooputils.sort.DelimitedLineRecordReader.java
License:Apache License
protected void initialize(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;

    String rowDelim = job.get("textinputformat.record.delimiter", null);

    if (codec != null) {
        if (rowDelim != null) {
            byte[] hexcode = SortConfig.getHexDelimiter(rowDelim);
            in = new DelimitedLineReader(codec.createInputStream(fileIn), job,
                    (hexcode != null) ? hexcode : rowDelim.getBytes());
        } else {
            in = new DelimitedLineReader(codec.createInputStream(fileIn), job);
        }
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        if (rowDelim != null) {
            byte[] hexcode = SortConfig.getHexDelimiter(rowDelim);
            in = new DelimitedLineReader(fileIn, job, (hexcode != null) ? hexcode : rowDelim.getBytes());
        } else {
            in = new DelimitedLineReader(fileIn, job);
        }
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
From source file:com.alexholmes.hadooputils.sort.DelimitedTextOutputFormat.java
License:Apache License
public RecordWriter<K, V> getRecordWriter(FileSystem ignored, JobConf job, String name, Progressable progress)
        throws IOException {
    SortConfig sortConf = new SortConfig(job);
    boolean isCompressed = getCompressOutput(job);
    String lineSeparator = sortConf.getRowSeparator("\n");
    byte[] hexcode = SortConfig.getHexDelimiter(lineSeparator);
    lineSeparator = (hexcode != null) ? new String(hexcode, "UTF-8") : lineSeparator;

    if (!isCompressed) {
        Path file = FileOutputFormat.getTaskOutputPath(job, name);
        FileSystem fs = file.getFileSystem(job);
        FSDataOutputStream fileOut = fs.create(file, progress);
        return new DelimitedLineRecordWriter<K, V>(fileOut, lineSeparator);
    } else {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job);
        Path file = FileOutputFormat.getTaskOutputPath(job, name + codec.getDefaultExtension());
        FileSystem fs = file.getFileSystem(job);
        FSDataOutputStream fileOut = fs.create(file, progress);
        return new DelimitedLineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
                lineSeparator);
    }
}
From source file:com.alexholmes.hadooputils.sort.LzoDelimitedLineRecordReader.java
License:Apache License
@Override
protected void initialize(Configuration job, FileSplit split) throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("Codec for file " + file + " not found, cannot run");
    }

    // open the file and seek to the start of the split
    fileIn = fs.open(split.getPath());

    // creates input stream and also reads the file header
    String rowDelim = job.get("textinputformat.record.delimiter", null);
    if (rowDelim != null) {
        byte[] hexcode = SortConfig.getHexDelimiter(rowDelim);
        in = new DelimitedLineReader(fileIn, job, (hexcode != null) ? hexcode : rowDelim.getBytes());
    } else {
        in = new DelimitedLineReader(codec.createInputStream(fileIn), job);
    }

    if (start != 0) {
        fileIn.seek(start);

        // read and ignore the first line
        in.readLine(new Text());
        start = fileIn.getPos();
    }
    this.pos = start;
}
From source file:com.alexholmes.hadooputils.sort.Sort.java
License:Apache License
/**
 * The driver for the sort MapReduce job.
 *
 * @param jobConf           sort configuration
 * @param numMapTasks       number of map tasks
 * @param numReduceTasks    number of reduce tasks
 * @param sampler           sampler, if required
 * @param codecClass        the compression codec for compressing final outputs
 * @param mapCodecClass     the compression codec for compressing intermediary map outputs
 * @param createLzopIndexes whether or not a MR job should be launched to create LZOP indexes
 *                          for the job output files
 * @param inputDirAsString  input directory in CSV-form
 * @param outputDirAsString output directory
 * @return true if the job completed successfully
 * @throws IOException        if something went wrong
 * @throws URISyntaxException if a URI wasn't correctly formed
 */
public boolean runJob(final JobConf jobConf, final Integer numMapTasks, final Integer numReduceTasks,
        final InputSampler.Sampler<K, V> sampler, final Class<? extends CompressionCodec> codecClass,
        final Class<? extends CompressionCodec> mapCodecClass, final boolean createLzopIndexes,
        final String inputDirAsString, final String outputDirAsString) throws IOException, URISyntaxException {

    jobConf.setJarByClass(Sort.class);
    jobConf.setJobName("sorter");

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();

    if (numMapTasks != null) {
        jobConf.setNumMapTasks(numMapTasks);
    }
    if (numReduceTasks != null) {
        jobConf.setNumReduceTasks(numReduceTasks);
    } else {
        int numReduces = (int) (cluster.getMaxReduceTasks() * 0.9);
        String sortReduces = jobConf.get("test.sort.reduces_per_host");
        if (sortReduces != null) {
            numReduces = cluster.getTaskTrackers() * Integer.parseInt(sortReduces);
        }
        // Set user-supplied (possibly default) job configs
        jobConf.setNumReduceTasks(numReduces);
    }

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(SortReduce.class);

    jobConf.setInputFormat(SortInputFormat.class);

    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);

    if (mapCodecClass != null) {
        jobConf.setMapOutputCompressorClass(mapCodecClass);
    }

    if (codecClass != null) {
        jobConf.setBoolean("mapred.output.compress", true);
        jobConf.setClass("mapred.output.compression.codec", codecClass, CompressionCodec.class);
    }

    FileInputFormat.setInputPaths(jobConf, inputDirAsString);
    FileOutputFormat.setOutputPath(jobConf, new Path(outputDirAsString));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];

        FileSystem fileSystem = FileSystem.get(jobConf);

        if (fileSystem.exists(inputDir) && fileSystem.isFile(inputDir)) {
            inputDir = inputDir.getParent();
        }
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + jobConf.getNumReduceTasks() + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took "
            + TimeUnit.MILLISECONDS.toSeconds(endTime.getTime() - startTime.getTime()) + " seconds.");

    if (jobResult.isSuccessful()) {
        if (createLzopIndexes && codecClass != null && LzopCodec.class.equals(codecClass)) {
            new LzoIndexer(jobConf).index(new Path(outputDirAsString));
        }
        return true;
    }
    return false;
}
From source file:com.alexholmes.hadooputils.sort.SortInputSampler.java
License:Apache License
public static <K, V> void writePartitionFile(JobConf job, Sampler<K, V> sampler) throws IOException {
    Configuration conf = job;
    // Use the input format defined in the job, NOT the one provided by
    // the parent class's writePartitionFile() method, which will be a plain
    // TextInputFormat by default
    final InputFormat inf = job.getInputFormat();
    int numPartitions = job.getNumReduceTasks();
    K[] samples = (K[]) sampler.getSample(inf, job);
    RawComparator<K> comparator = (RawComparator<K>) job.getOutputKeyComparator();
    Arrays.sort(samples, comparator);
    Path dst = new Path(TotalOrderPartitioner.getPartitionFile(job));
    FileSystem fs = dst.getFileSystem(conf);
    if (fs.exists(dst)) {
        fs.delete(dst, false);
    }
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, dst, job.getMapOutputKeyClass(),
            NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    float stepSize = samples.length / (float) numPartitions;
    int last = -1;
    for (int i = 1; i < numPartitions; ++i) {
        int k = Math.round(stepSize * i);
        while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
            ++k;
        }
        writer.append(samples[k], nullValue);
        last = k;
    }
    writer.close();
}
From source file:com.alexholmes.hdfsslurper.Configurator.java
License:Apache License
public static void validateSameFileSystem(Path p1, Path p2, Configuration config)
        throws IOException, ConfigSettingException {
    FileSystem fs1 = p1.getFileSystem(config);
    FileSystem fs2 = p2.getFileSystem(config);
    if (!compareFs(fs1, fs2)) {
        throw new ConfigSettingException("The two paths must exist on the same file system: " + p1 + "," + p2);
    }
    if (p1.equals(p2)) {
        throw new ConfigSettingException("The paths must be distinct: " + p1);
    }
}
From source file:com.alexholmes.hdfsslurper.Configurator.java
License:Apache License
public static void testCreateDir(Path p, Configuration conf)
        throws IOException, ConfigSettingException, FileSystemMkdirFailed {
    FileSystem fs = p.getFileSystem(conf);
    if (fs.exists(p) && !fs.getFileStatus(p).isDir()) {
        throw new ConfigSettingException("Directory appears to be a file: '" + p + "'");
    }
    if (!fs.exists(p)) {
        log.info("Attempting creation of directory: " + p);
        if (!fs.mkdirs(p)) {
            throw new FileSystemMkdirFailed("Failed to create directory: '" + p + "'");
        }
    }
}