List of usage examples for org.apache.hadoop.fs Path getFileSystem
public FileSystem getFileSystem(Configuration conf) throws IOException
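A minimal sketch of the basic pattern, assuming a placeholder path and a default Configuration (none of these values come from the examples below): resolve the FileSystem that owns the Path, then operate on the path through that FileSystem.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileSystemExample {
    public static void main(String[] args) throws IOException {
        // Hypothetical path; without a scheme it resolves against fs.defaultFS.
        Path output = new Path("/tmp/example/output");
        Configuration conf = new Configuration();

        // getFileSystem returns the FileSystem that owns this path
        // (local, HDFS, etc., depending on the path's scheme and the configuration).
        FileSystem fs = output.getFileSystem(conf);

        // Typical follow-up calls, as in the examples below.
        if (fs.exists(output)) {
            fs.delete(output, true); // recursive delete
        }
        try (FSDataOutputStream out = fs.create(output)) {
            out.writeUTF("hello");
        }
    }
}

The examples that follow use the same call to clean up job output directories, open input splits, and create Parquet files.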
From source file:cn.edu.hfut.dmic.webcollectorcluster.fetcher.Fetcher.java
@Override
public int run(String[] args) throws Exception {
    JobConf jc = new JobConf(getConf());
    jc.setJarByClass(Fetcher.class);
    jc.setInputFormat(SequenceFileInputFormat.class);
    Path input = new Path(args[0], "current");
    Path output = new Path(args[1]);
    Configuration conf = CrawlerConfiguration.create();
    FileSystem fs = output.getFileSystem(conf);
    if (fs.exists(output)) {
        fs.delete(output);
    }
    FileInputFormat.addInputPath(jc, input);
    FileOutputFormat.setOutputPath(jc, output);
    jc.setMapOutputKeyClass(Text.class);
    jc.setMapOutputValueClass(WebWritable.class);
    jc.setMapRunnerClass(Fetcher.class);
    jc.setOutputFormat(FetcherOutputFormat.class);
    JobClient.runJob(jc);
    return 0;
}
From source file:cn.edu.hfut.dmic.webcollectorcluster.generator.Injector.java
public void inject(Path crawlDir, ArrayList<String> urls)
        throws IOException, InterruptedException, ClassNotFoundException, Exception {
    Path crawldb = new Path(crawlDir, "crawldb");
    Configuration config = CrawlerConfiguration.create();
    System.out.println(config.get("mapred.jar"));
    FileSystem fs = crawldb.getFileSystem(config);
    Path tempdb = new Path(crawldb, "temp");
    if (fs.exists(tempdb)) {
        fs.delete(tempdb);
    }
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, config, new Path(tempdb, "info.avro"),
            Text.class, CrawlDatum.class);
    for (String url : urls) {
        CrawlDatum crawldatum = new CrawlDatum();
        crawldatum.setUrl(url);
        crawldatum.setStatus(CrawlDatum.STATUS_DB_INJECTED);
        writer.append(new Text(url), crawldatum);
        System.out.println("inject:" + url);
    }
    writer.close();
    String[] args = new String[] { crawldb.toString(), tempdb.toString() };
    ToolRunner.run(CrawlerConfiguration.create(), new Merge(), args);
    Merge.install(crawldb);
    if (fs.exists(tempdb)) {
        fs.delete(tempdb);
    }
}
From source file:cn.edu.hfut.dmic.webcollectorcluster.generator.Merge.java
public static Job createJob(Configuration conf, Path crawldb) throws IOException {
    Job job = new Job(conf);
    //job.setJarByClass(Merge.class);
    job.getConfiguration().set("mapred",
            "/home/hu/mygit/WebCollector2/WebCollectorCluster/target/WebCollectorCluster-2.0.jar");
    Path newdb = new Path(crawldb, "new");
    Path currentdb = new Path(crawldb, "current");
    FileSystem fs = crawldb.getFileSystem(CrawlerConfiguration.create());
    if (fs.exists(currentdb)) {
        FileInputFormat.addInputPath(job, currentdb);
    }
    if (fs.exists(newdb)) {
        fs.delete(newdb);
    }
    FileOutputFormat.setOutputPath(job, newdb);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setMapperClass(MergeMap.class);
    job.setReducerClass(MergeReduce.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    return job;
}
From source file:cn.edu.hfut.dmic.webcollectorcluster.generator.Merge.java
public static void install(Path crawldb) throws IOException {
    FileSystem fs = crawldb.getFileSystem(CrawlerConfiguration.create());
    Path newdb = new Path(crawldb, "new");
    Path currentdb = new Path(crawldb, "current");
    Path olddb = new Path(crawldb, "old");
    if (fs.exists(currentdb)) {
        if (fs.exists(olddb)) {
            fs.delete(olddb);
        }
        fs.rename(currentdb, olddb);
    }
    fs.mkdirs(crawldb);
    fs.rename(newdb, currentdb);
}
From source file:cn.edu.xmu.dm.mapreduce.Sort.java
License:Apache License
/**
 * The main driver for the sort program. Invoke this method to submit the map/reduce job.
 *
 * @throws IOException When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "Sorter");
    job.setJarByClass(Sort.class);

    JobConf jobConf = new JobConf(getConf(), Sort.class);
    jobConf.setJobName("sorter");
    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);
    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);
    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.<K, V>writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}
From source file:cn.uc.hadoop.mapreduce.lib.input.FileNameLineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // ADD by qiujw: use the file name as the record key
    key = new Text(file.getName());

    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job,
                        this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file:cn.uc.hadoop.mapreduce.lib.input.FilePathLineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // ADD by qiujw: use the full file path as the record key
    key = new Text(file.toString());

    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job,
                        this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file:cn.uway.util.apache.parquet.hadoop.ParquetFileWriter.java
License:Apache License
/**
 * @param configuration Hadoop configuration
 * @param schema the schema of the data
 * @param file the file to write to
 * @param mode file creation mode
 * @param rowGroupSize the row group size
 * @throws IOException if the file can not be created
 */
public ParquetFileWriter(Configuration configuration, MessageType schema, Path file, Mode mode,
        long rowGroupSize, int maxPaddingSize) throws IOException {
    TypeUtil.checkValidWriteSchema(schema);
    this.schema = schema;
    FileSystem fs = file.getFileSystem(configuration);
    boolean overwriteFlag = (mode == Mode.OVERWRITE);

    if (supportsBlockSize(fs)) {
        // use the default block size, unless row group size is larger
        long dfsBlockSize = Math.max(fs.getDefaultBlockSize(file), rowGroupSize);

        this.alignment = PaddingAlignment.get(dfsBlockSize, rowGroupSize, maxPaddingSize);
        this.out = fs.create(file, overwriteFlag, DFS_BUFFER_SIZE_DEFAULT,
                fs.getDefaultReplication(file), dfsBlockSize);
    } else {
        this.alignment = NoAlignment.get(rowGroupSize);
        this.out = fs.create(file, overwriteFlag);
    }
}
From source file:cn.uway.util.apache.parquet.hadoop.ParquetFileWriter.java
License:Apache License
/**
 * FOR TESTING ONLY.
 *
 * @param configuration Hadoop configuration
 * @param schema the schema of the data
 * @param file the file to write to
 * @param rowAndBlockSize the row group size
 * @throws IOException if the file can not be created
 */
ParquetFileWriter(Configuration configuration, MessageType schema, Path file, long rowAndBlockSize,
        int maxPaddingSize) throws IOException {
    FileSystem fs = file.getFileSystem(configuration);
    this.schema = schema;
    this.alignment = PaddingAlignment.get(rowAndBlockSize, rowAndBlockSize, maxPaddingSize);
    this.out = fs.create(file, true, DFS_BUFFER_SIZE_DEFAULT, fs.getDefaultReplication(file),
            rowAndBlockSize);
}
From source file:cn.uway.util.apache.parquet.hadoop.ParquetFileWriter.java
License:Apache License
/**
 * writes a _metadata and _common_metadata file
 *
 * @param configuration the configuration to use to get the FileSystem
 * @param outputPath the directory to write the _metadata file to
 * @param footers the list of footers to merge
 * @throws IOException
 */
public static void writeMetadataFile(Configuration configuration, Path outputPath, List<Footer> footers)
        throws IOException {
    FileSystem fs = outputPath.getFileSystem(configuration);
    outputPath = outputPath.makeQualified(fs);
    ParquetMetadata metadataFooter = mergeFooters(outputPath, footers);
    writeMetadataFile(outputPath, metadataFooter, fs, PARQUET_METADATA_FILE);
    metadataFooter.getBlocks().clear();
    writeMetadataFile(outputPath, metadataFooter, fs, PARQUET_COMMON_METADATA_FILE);
}