List of usage examples for org.apache.hadoop.fs.Path#getFileSystem
public FileSystem getFileSystem(Configuration conf) throws IOException
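Before the individual examples, here is a minimal, self-contained sketch of the pattern they all share: resolve the FileSystem that owns a Path (HDFS, local, S3A, ...) from its scheme and a Configuration, then read or write through it. The class name and the /tmp path below are hypothetical, chosen only for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;
import java.nio.charset.StandardCharsets;

public class GetFileSystemExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Hypothetical path; the scheme (here the default FS) decides which FileSystem is returned.
        Path path = new Path("/tmp/getfilesystem-example.txt");
        FileSystem fs = path.getFileSystem(conf);

        // Write through the resolved FileSystem, then inspect the result.
        try (FSDataOutputStream out = fs.create(path, true)) {
            out.write("hello".getBytes(StandardCharsets.UTF_8));
        }
        System.out.println("Wrote " + fs.getFileStatus(path).getLen() + " bytes via " + fs.getUri());
    }
}

Each example below follows the same resolution step before doing job setup, sampling, or record I/O.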
From source file:com.cloudera.knittingboar.sgd.POLRMasterDriver.java
License:Apache License
/**
 * [ needs to be checked ]
 *
 * NOTE: This should only be used for durability purposes in checkpointing the
 * workers
 *
 * @param outputFilename
 * @param conf
 * @throws Exception
 */
public void SaveModelToHDFS(String outputFilename, Configuration conf) throws Exception {
    Path path = new Path(outputFilename);
    FileSystem fs = path.getFileSystem(conf);
    FSDataOutputStream modelHDFSOutput = fs.create(path, true);
    try {
        polr_modelparams.saveTo(modelHDFSOutput);
    } finally {
        modelHDFSOutput.close();
    }
}
From source file:com.cloudera.recordservice.examples.terasort.TeraGen.java
License:Apache License
/**
 * @param args the cli arguments
 */
@Override
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Job job = Job.getInstance(getConf());
    if (args.length != 2) {
        usage();
        return 2;
    }
    setNumberOfRows(job, parseHumanLong(args[0]));
    Path outputDir = new Path(args[1]);
    if (outputDir.getFileSystem(getConf()).exists(outputDir)) {
        throw new IOException("Output directory " + outputDir + " already exists.");
    }
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setJobName("TeraGen");
    job.setJarByClass(TeraGen.class);
    job.setMapperClass(SortGenMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(RangeInputFormat.class);
    job.setOutputFormatClass(TeraOutputFormat.class);
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.cloudera.recordservice.examples.terasort.TeraInputFormat.java
License:Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws Throwable if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            @Override
            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {
                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing partitions took " + (t3 - t2) + "ms");
}
From source file:com.cloudera.recordservice.examples.terasort.TeraOutputFormat.java
License:Apache License
@Override
public RecordWriter<Text, Text> getRecordWriter(TaskAttemptContext job) throws IOException {
    Path file = getDefaultWorkFile(job, "");
    FileSystem fs = file.getFileSystem(job.getConfiguration());
    FSDataOutputStream fileOut = fs.create(file);
    return new TeraRecordWriter(fileOut, job);
}
From source file:com.cloudera.recordservice.mapreduce.testapps.RecordCount.java
License:Apache License
public static long countRecords(String path) throws IOException {
    String output = TestUtil.getTempDirectory();
    Path inputPath = new Path(path);
    Path outputPath = new Path(output);

    JobConf conf = new JobConf(RecordCount.class);
    conf.setJobName("recordcount");
    conf.setOutputKeyClass(NullWritable.class);
    conf.setOutputValueClass(LongWritable.class);
    conf.setInt("mapreduce.job.reduces", 1);
    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);
    conf.setInputFormat(com.cloudera.recordservice.mapred.TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);
    JobClient.runJob(conf);

    // Read the result and return it. Since we set the number of reducers to 1,
    // there is always just one file containing the value.
    FileSystem fs = outputPath.getFileSystem(conf);
    FSDataInputStream resultStream = fs.open(new Path(output + "/part-00000"));
    byte[] bytes = new byte[16];
    int length = resultStream.read(bytes);
    String result = new String(bytes, 0, length).trim();
    return Long.parseLong(result);
}
From source file:com.cloudera.recordservice.pig.HCatRSLoader.java
License:Apache License
/**
 * A utility method to get the size of inputs. This is accomplished by summing the
 * size of all input paths on supported FileSystems. Locations whose size cannot be
 * determined are ignored. Note non-FileSystem and unpartitioned locations will not
 * report their input size by default. This method was copied from HcatBaseLoader to use
 * the Record Service InputJobInfo.
 */
protected static long getSizeInBytes(InputJobInfo inputJobInfo) throws IOException {
    Configuration conf = new Configuration();
    long sizeInBytes = 0;
    for (PartInfo partInfo : inputJobInfo.getPartitions()) {
        try {
            Path p = new Path(partInfo.getLocation());
            if (p.getFileSystem(conf).isFile(p)) {
                sizeInBytes += p.getFileSystem(conf).getFileStatus(p).getLen();
            } else {
                FileStatus[] fileStatuses = p.getFileSystem(conf).listStatus(p);
                if (fileStatuses != null) {
                    for (FileStatus child : fileStatuses) {
                        sizeInBytes += child.getLen();
                    }
                }
            }
        } catch (IOException e) {
            // Report size to the extent possible.
        }
    }
    LOG.info("SIZE:" + sizeInBytes + "\n\n");
    return sizeInBytes;
}
From source file:com.cloudera.sa.ExcelRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    Configuration conf = context.getConfiguration();
    Path file = fileSplit.getPath();
    FileSystem fs = file.getFileSystem(conf);
    this.in = fs.open(file);
    XSSFWorkbook workbook = new XSSFWorkbook(this.in);
    XSSFSheet sheet = workbook.getSheetAt(0);
    this.totalRows = sheet.getPhysicalNumberOfRows();
    this.processedRows = 0;
    this.rowIterator = sheet.rowIterator();
}
From source file:com.cloudera.science.avro.streaming.AvroAsJSONOutputFormat.java
License:Open Source License
@Override
public RecordWriter<Text, Text> getRecordWriter(FileSystem ignored, JobConf job, String name,
        Progressable progress) throws IOException {
    if (schema == null) {
        SchemaLoader loader = new SchemaLoader(job);
        this.schema = loader.load(job.get(SCHEMA_LITERAL), job.get(SCHEMA_URL), job.get(SCHEMA_TYPE_NAME));
        this.converter = new JsonConverter(schema);
        this.readKey = job.getBoolean(READ_KEY, true);
    }
    DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(
            new GenericDatumWriter<GenericRecord>(schema));
    if (getCompressOutput(job)) {
        int level = job.getInt(AvroOutputFormat.DEFLATE_LEVEL_KEY, AvroOutputFormat.DEFAULT_DEFLATE_LEVEL);
        String codecName = job.get(AvroJob.CONF_OUTPUT_CODEC,
                org.apache.avro.file.DataFileConstants.DEFLATE_CODEC);
        CodecFactory codec = codecName.equals(DataFileConstants.DEFLATE_CODEC)
                ? CodecFactory.deflateCodec(level)
                : CodecFactory.fromString(codecName);
        writer.setCodec(codec);
    }
    writer.setSyncInterval(
            job.getInt(AvroOutputFormat.SYNC_INTERVAL_KEY, DataFileConstants.DEFAULT_SYNC_INTERVAL));
    Path path = FileOutputFormat.getTaskOutputPath(job, name + AvroOutputFormat.EXT);
    writer.create(schema, path.getFileSystem(job).create(path));
    return new AvroAsJSONRecordWriter(writer, converter, readKey);
}
From source file:com.cloudera.science.quince.FileUtils.java
License:Open Source License
public static Path[] findVcfs(Path path, Configuration conf) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    if (fs.isDirectory(path)) {
        FileStatus[] fileStatuses = fs.listStatus(path, new HiddenPathFilter());
        Path[] vcfs = new Path[fileStatuses.length];
        int i = 0;
        for (FileStatus status : fileStatuses) {
            vcfs[i++] = status.getPath();
        }
        return vcfs;
    } else {
        return new Path[] { path };
    }
}
From source file:com.cloudera.science.quince.FileUtils.java
License:Open Source License
public static boolean sampleGroupExists(Path path, Configuration conf, String sampleGroup) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    if (!fs.exists(path)) {
        return false;
    }
    for (FileStatus chrStatus : fs.listStatus(path, new PartitionPathFilter("chr"))) {
        for (FileStatus posStatus : fs.listStatus(chrStatus.getPath(), new PartitionPathFilter("pos"))) {
            if (fs.listStatus(posStatus.getPath(),
                    new PartitionPathFilter("sample_group", sampleGroup)).length > 0) {
                return true;
            }
        }
    }
    return false;
}