List of usage examples for org.apache.hadoop.fs.FileSystem.makeQualified
public Path makeQualified(Path path)
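makeQualified resolves a Path against the file system's default URI and working directory, so a relative or scheme-less path comes back fully qualified with an explicit scheme and authority. A minimal sketch of that behavior is below; it runs against the local file system for simplicity, and the class name and the "solroutput" path are illustrative only, not taken from the examples that follow.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MakeQualifiedSketch {
  public static void main(String[] args) throws Exception {
    // With no cluster configuration the default file system is the local one (file:///).
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    // A scheme-less, relative path...
    Path relative = new Path("solroutput");
    // ...comes back absolute and with an explicit scheme, resolved against
    // the file system's working directory.
    Path qualified = fs.makeQualified(relative);
    System.out.println(qualified); // e.g. file:/home/user/solroutput
  }
}

The examples below follow the same pattern: qualify a path before handing it to a MapReduce job or a command-line tool, so the path carries its scheme and authority regardless of the default file system of whoever reads it later.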
From source file:com.ngdata.hbaseindexer.mr.HBaseMapReduceIndexerToolTest.java
License:Apache License
@Test
public void testIndexer_StartAndEndRows() throws Exception {
  FileSystem fs = FileSystem.get(HBASE_TEST_UTILITY.getConfiguration());
  MR_TEST_UTIL.runTool("--hbase-indexer-file",
      new File(Resources.getResource(getClass(), "user_indexer.xml").toURI()).toString(),
      "--solr-home-dir", MINIMR_CONF_DIR.toString(),
      "--output-dir", fs.makeQualified(new Path("/solroutput")).toString(),
      "--shards", "1",
      "--hbase-start-row", "row0100",
      "--hbase-end-row", "row1000",
      "--max-segments", "2",
      "--overwrite-output-dir");

  ForkedTestUtils.validateSolrServerDocumentCount(MINIMR_CONF_DIR,
      FileSystem.get(HBASE_TEST_UTILITY.getConfiguration()),
      new Path("/solroutput", "results"), 900, 1);
}
From source file:com.phantom.hadoop.examples.BaileyBorweinPlouffe.java
License:Apache License
/** Run a map/reduce job to compute Pi. */
private static void compute(int startDigit, int nDigits, int nMaps, String workingDir,
    Configuration conf, PrintStream out) throws IOException {
  final String name = startDigit + "_" + nDigits;

  // setup working directory
  out.println("Working Directory = " + workingDir);
  out.println();
  final FileSystem fs = FileSystem.get(conf);
  final Path dir = fs.makeQualified(new Path(workingDir));
  if (fs.exists(dir)) {
    throw new IOException("Working directory " + dir + " already exists. Please remove it first.");
  } else if (!fs.mkdirs(dir)) {
    throw new IOException("Cannot create working directory " + dir);
  }

  out.println("Start Digit = " + startDigit);
  out.println("Number of Digits = " + nDigits);
  out.println("Number of Maps = " + nMaps);

  // setup a job
  final Job job = createJob(name, conf);
  final Path hexfile = new Path(dir, "pi_" + name + ".hex");
  FileOutputFormat.setOutputPath(job, new Path(dir, "out"));

  // setup custom properties
  job.getConfiguration().set(WORKING_DIR_PROPERTY, dir.toString());
  job.getConfiguration().set(HEX_FILE_PROPERTY, hexfile.toString());
  job.getConfiguration().setInt(DIGIT_START_PROPERTY, startDigit);
  job.getConfiguration().setInt(DIGIT_SIZE_PROPERTY, nDigits);
  job.getConfiguration().setInt(DIGIT_PARTS_PROPERTY, nMaps);

  // start a map/reduce job
  out.println("\nStarting Job ...");
  final long startTime = System.currentTimeMillis();
  try {
    if (!job.waitForCompletion(true)) {
      out.println("Job failed.");
      System.exit(1);
    }
  } catch (Exception e) {
    throw new RuntimeException(e);
  } finally {
    final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
    out.println("Duration is " + duration + " seconds.");
  }
  out.println("Output file: " + hexfile);
}
From source file:com.phantom.hadoop.examples.pi.DistSum.java
License:Apache License
/** Start a job to compute sigma */
private void compute(final String name, Summation sigma) throws IOException {
  if (sigma.getValue() != null)
    throw new IOException("sigma.getValue() != null, sigma=" + sigma);

  // setup remote directory
  final FileSystem fs = FileSystem.get(getConf());
  final Path dir = fs.makeQualified(new Path(parameters.remoteDir, name));
  if (!Util.createNonexistingDirectory(fs, dir))
    return;

  // setup a job
  final Job job = createJob(name, sigma);
  final Path outdir = new Path(dir, "out");
  FileOutputFormat.setOutputPath(job, outdir);

  // start a map/reduce job
  final String startmessage = "steps/parts = " + sigma.E.getSteps() + "/" + parameters.nParts
      + " = " + Util.long2string(sigma.E.getSteps() / parameters.nParts);
  Util.runJob(name, job, parameters.machine, startmessage, timer);
  final List<TaskResult> results = Util.readJobOutputs(fs, outdir);
  Util.writeResults(name, results, fs, parameters.remoteDir);
  fs.delete(dir, true);

  // combine results
  final List<TaskResult> combined = Util.combine(results);
  final PrintWriter out = Util.createWriter(parameters.localDir, name);
  try {
    for (TaskResult r : combined) {
      final String s = taskResult2string(name, r);
      out.println(s);
      out.flush();
      Util.out.println(s);
    }
  } finally {
    out.close();
  }
  if (combined.size() == 1) {
    final Summation s = combined.get(0).getElement();
    if (sigma.contains(s) && s.contains(sigma))
      sigma.setValue(s.getValue());
  }
}
From source file:com.phantom.hadoop.examples.QuasiMonteCarlo.java
License:Apache License
/**
 * Run a map/reduce job for estimating Pi.
 *
 * @return the estimated value of Pi
 */
public static BigDecimal estimatePi(int numMaps, long numPoints, Path tmpDir, Configuration conf)
    throws IOException, ClassNotFoundException, InterruptedException {
  Job job = new Job(conf);
  // setup job conf
  job.setJobName(QuasiMonteCarlo.class.getSimpleName());
  job.setJarByClass(QuasiMonteCarlo.class);

  job.setInputFormatClass(SequenceFileInputFormat.class);

  job.setOutputKeyClass(BooleanWritable.class);
  job.setOutputValueClass(LongWritable.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  job.setMapperClass(QmcMapper.class);

  job.setReducerClass(QmcReducer.class);
  job.setNumReduceTasks(1);

  // turn off speculative execution, because DFS doesn't handle
  // multiple writers to the same file.
  job.setSpeculativeExecution(false);

  // setup input/output directories
  final Path inDir = new Path(tmpDir, "in");
  final Path outDir = new Path(tmpDir, "out");
  FileInputFormat.setInputPaths(job, inDir);
  FileOutputFormat.setOutputPath(job, outDir);

  final FileSystem fs = FileSystem.get(conf);
  if (fs.exists(tmpDir)) {
    throw new IOException(
        "Tmp directory " + fs.makeQualified(tmpDir) + " already exists. Please remove it first.");
  }
  if (!fs.mkdirs(inDir)) {
    throw new IOException("Cannot create input directory " + inDir);
  }

  try {
    // generate an input file for each map task
    for (int i = 0; i < numMaps; ++i) {
      final Path file = new Path(inDir, "part" + i);
      final LongWritable offset = new LongWritable(i * numPoints);
      final LongWritable size = new LongWritable(numPoints);
      final SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file,
          LongWritable.class, LongWritable.class, CompressionType.NONE);
      try {
        writer.append(offset, size);
      } finally {
        writer.close();
      }
      System.out.println("Wrote input for Map #" + i);
    }

    // start a map/reduce job
    System.out.println("Starting Job");
    final long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
    System.out.println("Job Finished in " + duration + " seconds");

    // read outputs
    Path inFile = new Path(outDir, "reduce-out");
    LongWritable numInside = new LongWritable();
    LongWritable numOutside = new LongWritable();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, conf);
    try {
      reader.next(numInside, numOutside);
    } finally {
      reader.close();
    }

    // compute estimated value
    final BigDecimal numTotal = BigDecimal.valueOf(numMaps).multiply(BigDecimal.valueOf(numPoints));
    return BigDecimal.valueOf(4).setScale(20).multiply(BigDecimal.valueOf(numInside.get()))
        .divide(numTotal, RoundingMode.HALF_UP);
  } finally {
    fs.delete(tmpDir, true);
  }
}
From source file:com.skp.experiment.fpm.pfpgrowth.PFPGrowth.java
License:Apache License
/**
 * Generates the fList from the serialized string representation
 *
 * @return Deserialized Feature Frequency List
 */
public static List<Pair<String, Long>> readFList(Configuration conf) throws IOException {
  List<Pair<String, Long>> list = new ArrayList<Pair<String, Long>>();

  Path[] files = DistributedCache.getLocalCacheFiles(conf);
  if (files == null) {
    throw new IOException("Cannot read Frequency list from Distributed Cache");
  }
  if (files.length != 1) {
    throw new IOException("Cannot read Frequency list from Distributed Cache (" + files.length + ")");
  }
  FileSystem fs = FileSystem.getLocal(conf);
  Path fListLocalPath = fs.makeQualified(files[0]);
  // Fallback if we are running locally.
  if (!fs.exists(fListLocalPath)) {
    URI[] filesURIs = DistributedCache.getCacheFiles(conf);
    if (filesURIs == null) {
      throw new IOException("Cannot read Frequency list from Distributed Cache");
    }
    if (filesURIs.length != 1) {
      throw new IOException("Cannot read Frequency list from Distributed Cache (" + filesURIs.length + ")");
    }
    fListLocalPath = new Path(filesURIs[0].getPath());
  }
  for (Pair<Text, LongWritable> record : new SequenceFileIterable<Text, LongWritable>(fListLocalPath, true, conf)) {
    list.add(new Pair<String, Long>(record.getFirst().toString(), record.getSecond().get()));
  }
  return list;
}
From source file:com.streamsets.pipeline.stage.origin.hdfs.cluster.ClusterHdfsSource.java
License:Apache License
@Override
public List<ConfigIssue> init() {
  List<ConfigIssue> issues = super.init();
  validateHadoopFS(issues);
  // This is for getting no of splits - no of executors
  hadoopConf.set(FileInputFormat.LIST_STATUS_NUM_THREADS, "5"); // Per Hive-on-Spark
  hadoopConf.set(FileInputFormat.SPLIT_MAXSIZE, String.valueOf(750000000)); // Per Hive-on-Spark
  for (Map.Entry<String, String> config : hdfsConfigs.entrySet()) {
    hadoopConf.set(config.getKey(), config.getValue());
  }
  List<Path> hdfsDirPaths = new ArrayList<>();
  if (hdfsDirLocations == null || hdfsDirLocations.isEmpty()) {
    issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations", Errors.HADOOPFS_18));
  } else if (issues.isEmpty()) {
    for (String hdfsDirLocation : hdfsDirLocations) {
      try {
        FileSystem fs = getFileSystemForInitDestroy();
        Path ph = fs.makeQualified(new Path(hdfsDirLocation));
        hdfsDirPaths.add(ph);
        if (!fs.exists(ph)) {
          issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
              Errors.HADOOPFS_10, hdfsDirLocation));
        } else if (!fs.getFileStatus(ph).isDirectory()) {
          issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
              Errors.HADOOPFS_15, hdfsDirLocation));
        } else {
          try {
            FileStatus[] files = fs.listStatus(ph);
            if (files == null || files.length == 0) {
              issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                  Errors.HADOOPFS_16, hdfsDirLocation));
            } else if (getContext().isPreview() && previewBuffer.size() < PREVIEW_SIZE) {
              for (FileStatus fileStatus : files) {
                if (fileStatus.isFile()) {
                  String path = fileStatus.getPath().toString();
                  try {
                    List<Map.Entry> buffer;
                    if (dataFormat == DataFormat.AVRO) {
                      buffer = previewAvroBatch(fileStatus, PREVIEW_SIZE);
                    } else {
                      buffer = previewTextBatch(fileStatus, PREVIEW_SIZE);
                    }
                    for (int i = 0; i < buffer.size() && previewBuffer.size() < PREVIEW_SIZE; i++) {
                      Map.Entry entry = buffer.get(i);
                      previewBuffer.put(String.valueOf(entry.getKey()),
                          entry.getValue() == null ? null : entry.getValue());
                    }
                  } catch (IOException | InterruptedException ex) {
                    String msg = "Error opening " + path + ": " + ex;
                    LOG.info(msg, ex);
                    issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                        Errors.HADOOPFS_16, fileStatus.getPath()));
                  }
                }
              }
            }
          } catch (IOException ex) {
            issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                Errors.HADOOPFS_09, hdfsDirLocation, ex.toString(), ex));
          }
        }
      } catch (IOException ioe) {
        LOG.warn("Error connecting to HDFS filesystem: " + ioe, ioe);
        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
            Errors.HADOOPFS_11, hdfsDirLocation, ioe.toString(), ioe));
      }
    }
  }
  hadoopConf.set(FileInputFormat.INPUT_DIR, StringUtils.join(hdfsDirPaths, ","));
  hadoopConf.set(FileInputFormat.INPUT_DIR_RECURSIVE, Boolean.toString(recursive));
  switch (dataFormat) {
  case JSON:
    if (jsonMaxObjectLen < 1) {
      issues.add(getContext().createConfigIssue(Groups.JSON.name(), "jsonMaxObjectLen", Errors.HADOOPFS_04));
    }
    break;
  case TEXT:
    if (textMaxLineLen < 1) {
      issues.add(getContext().createConfigIssue(Groups.TEXT.name(), "textMaxLineLen", Errors.HADOOPFS_05));
    }
    break;
  case LOG:
    logDataFormatValidator = new LogDataFormatValidator(logMode, logMaxObjectLen, retainOriginalLine,
        customLogFormat, regex, grokPatternDefinition, grokPattern, enableLog4jCustomLogFormat,
        log4jCustomLogFormat, OnParseError.ERROR, 0, Groups.LOG.name(),
        getFieldPathToGroupMap(fieldPathsToGroupName));
    logDataFormatValidator.validateLogFormatConfig(issues, getContext());
    break;
  case DELIMITED:
    if (csvMaxObjectLen < 1) {
      issues.add(getContext().createConfigIssue(Groups.DELIMITED.name(), "csvMaxObjectLen", Errors.HADOOPFS_30));
    }
    break;
  case AVRO:
    if (avroSchema != null && !avroSchema.isEmpty()) {
      hadoopConf.set(AvroJob.INPUT_SCHEMA, avroSchema);
      hadoopConf.set(CONF_INPUT_KEY_SCHEMA, avroSchema);
    }
    break;
  default:
    issues.add(getContext().createConfigIssue(Groups.LOG.name(), "dataFormat", Errors.HADOOPFS_06, dataFormat));
  }
  validateParserFactoryConfigs(issues);
  LOG.info("Issues: " + issues);
  return issues;
}
From source file:com.test.PiEstimatorKrb.java
License:Apache License
/**
 * Run a map/reduce job for estimating Pi.
 *
 * @return the estimated value of Pi
 */
public static BigDecimal estimate(int numMaps, long numPoints, JobConf jobConf) throws IOException {
  // setup job conf
  jobConf.setJobName(PiEstimatorKrb.class.getSimpleName());

  jobConf.setInputFormat(SequenceFileInputFormat.class);

  jobConf.setOutputKeyClass(BooleanWritable.class);
  jobConf.setOutputValueClass(LongWritable.class);
  jobConf.setOutputFormat(SequenceFileOutputFormat.class);

  jobConf.setMapperClass(PiMapper.class);
  jobConf.setNumMapTasks(numMaps);

  jobConf.setReducerClass(PiReducer.class);
  jobConf.setNumReduceTasks(1);

  // turn off speculative execution, because DFS doesn't handle
  // multiple writers to the same file.
  jobConf.setSpeculativeExecution(false);

  // setup input/output directories
  final Path inDir = new Path(TMP_DIR, "in");
  final Path outDir = new Path(TMP_DIR, "out");
  FileInputFormat.setInputPaths(jobConf, inDir);
  FileOutputFormat.setOutputPath(jobConf, outDir);

  final FileSystem fs = FileSystem.get(jobConf);
  if (fs.exists(TMP_DIR)) {
    throw new IOException(
        "Tmp directory " + fs.makeQualified(TMP_DIR) + " already exists. Please remove it first.");
  }
  if (!fs.mkdirs(inDir)) {
    throw new IOException("Cannot create input directory " + inDir);
  }

  try {
    // generate an input file for each map task
    for (int i = 0; i < numMaps; ++i) {
      final Path file = new Path(inDir, "part" + i);
      final LongWritable offset = new LongWritable(i * numPoints);
      final LongWritable size = new LongWritable(numPoints);
      final SequenceFile.Writer writer = SequenceFile.createWriter(fs, jobConf, file,
          LongWritable.class, LongWritable.class, CompressionType.NONE);
      try {
        writer.append(offset, size);
      } finally {
        writer.close();
      }
      sLogger.info("Wrote input for Map #" + i);
    }

    // start a map/reduce job
    sLogger.info("Starting Job");
    final long startTime = System.currentTimeMillis();

    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
      jobConf.set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }

    JobClient.runJob(jobConf);
    final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
    sLogger.info("Job Finished in " + duration + " seconds");

    // read outputs
    Path inFile = new Path(outDir, "reduce-out");
    LongWritable numInside = new LongWritable();
    LongWritable numOutside = new LongWritable();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, jobConf);
    try {
      reader.next(numInside, numOutside);
    } finally {
      reader.close();
    }

    // compute estimated value
    return BigDecimal.valueOf(4).setScale(20).multiply(BigDecimal.valueOf(numInside.get()))
        .divide(BigDecimal.valueOf(numMaps)).divide(BigDecimal.valueOf(numPoints));
  } finally {
    fs.delete(TMP_DIR, true);
  }
}
From source file:com.twitter.algebra.matrix.format.Sequence2MatrixFormatJob.java
License:Apache License
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath)
    throws IOException, InterruptedException, ClassNotFoundException {
  FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
  NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "seq2mtx");

  @SuppressWarnings("deprecation")
  Job job = new Job(conf);
  job.setJarByClass(Sequence2MatrixFormatJob.class);
  job.setJobName(Sequence2MatrixFormatJob.class.getSimpleName());

  matrixInputPath = fs.makeQualified(matrixInputPath);
  matrixOutputPath = fs.makeQualified(matrixOutputPath);

  FileInputFormat.addInputPath(job, matrixInputPath);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  FileOutputFormat.setOutputPath(job, matrixOutputPath);
  job.setNumReduceTasks(0);
  job.setOutputFormatClass(MatrixOutputFormat.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(VectorWritable.class);

  job.submit();
  boolean res = job.waitForCompletion(true);
  if (!res)
    throw new IOException("Job failed!");
}
From source file:com.twitter.algebra.matrix.multiply.ABInnerHDFSBroadcastOfB.java
License:Apache License
/**
 * Perform A x B, where A and B refer to the paths that contain matrices in
 * {@link SequenceFileInputFormat}. Refer to {@link ABInnerHDFSBroadcastOfB}
 * for further details.
 *
 * @param conf the initial configuration
 * @param matrixInputPath path to matrix A
 * @param inMemMatrixDir path to matrix B (must be small enough to fit into
 *          memory)
 * @param matrixOutputPath path to which AxB will be written
 * @param inMemMatrixNumRows B rows
 * @param inMemMatrixNumCols B cols
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void run(Configuration conf, Path matrixInputPath, String inMemMatrixDir, Path matrixOutputPath,
    int inMemMatrixNumRows, int inMemMatrixNumCols)
    throws IOException, InterruptedException, ClassNotFoundException {
  conf = new Configuration(conf);
  FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
  NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "axbinner");

  conf.set(MATRIXINMEMORY, inMemMatrixDir);
  conf.setInt(MATRIXINMEMORYROWS, inMemMatrixNumRows);
  conf.setInt(MATRIXINMEMORYCOLS, inMemMatrixNumCols);

  @SuppressWarnings("deprecation")
  Job job = new Job(conf);
  job.setJarByClass(ABInnerHDFSBroadcastOfB.class);
  job.setJobName(ABInnerHDFSBroadcastOfB.class.getSimpleName());

  matrixInputPath = fs.makeQualified(matrixInputPath);
  matrixOutputPath = fs.makeQualified(matrixOutputPath);

  FileInputFormat.addInputPath(job, matrixInputPath);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  FileOutputFormat.setOutputPath(job, matrixOutputPath);
  job.setMapperClass(MyMapper.class);

  job.setNumReduceTasks(0);
  job.setOutputFormatClass(MatrixOutputFormat.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(VectorWritable.class);

  // since we do not use a reducer, to get total order the map output files
  // have to be renamed after this function returns:
  // {@link AlgebraCommon#fixPartitioningProblem}
  job.submit();
  boolean res = job.waitForCompletion(true);
  if (!res)
    throw new IOException("Job failed!");
}
From source file:com.twitter.algebra.matrix.multiply.ABOuterHDFSBroadcastOfA.java
License:Apache License
/**
 * Perform A x B, where A and B refer to the paths that contain matrices in
 * {@link SequenceFileInputFormat}. Refer to {@link ABOuterHDFSBroadcastOfA}
 * for further details.
 *
 * @param conf
 *          the initial configuration
 * @param inMemMatrixDir
 *          path to matrix A (must be small enough to fit into memory)
 * @param matrixInputPath
 *          path to matrix B
 * @param matrixOutputPath
 *          path to which AxB will be written
 * @param inMemMatrixNumRows
 *          A rows
 * @param inMemMatrixNumCols
 *          A cols
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void run(Configuration conf, String inMemMatrixDir, Path matrixInputPath, Path matrixOutputPath,
    int inMemMatrixNumRows, int inMemMatrixNumCols)
    throws IOException, InterruptedException, ClassNotFoundException {
  conf.set(MATRIXINMEMORY, inMemMatrixDir);
  conf.setInt(MATRIXINMEMORYROWS, inMemMatrixNumRows);
  conf.setInt(MATRIXINMEMORYCOLS, inMemMatrixNumCols);

  @SuppressWarnings("deprecation")
  Job job = new Job(conf);
  job.setJarByClass(ABOuterHDFSBroadcastOfA.class);
  job.setJobName(ABOuterHDFSBroadcastOfA.class.getSimpleName());

  FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
  matrixInputPath = fs.makeQualified(matrixInputPath);
  matrixOutputPath = fs.makeQualified(matrixOutputPath);

  FileInputFormat.addInputPath(job, matrixInputPath);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  FileOutputFormat.setOutputPath(job, matrixOutputPath);
  job.setMapperClass(MyMapper.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(VectorWritable.class);

  // ensures total order (when used with {@link MatrixOutputFormat})
  RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, inMemMatrixNumRows);

  job.setCombinerClass(AtBOuterStaticMapsideJoinJob.MyReducer.class);
  job.setReducerClass(AtBOuterStaticMapsideJoinJob.MyReducer.class);

  job.setOutputFormatClass(MatrixOutputFormat.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(VectorWritable.class);

  job.submit();
  boolean res = job.waitForCompletion(true);
  if (!res)
    throw new IOException("Job failed!");
}