Usage examples for org.apache.hadoop.fs.FileSystem.exists
public boolean exists(Path f) throws IOException
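Before the project-specific examples below, here is a minimal, self-contained sketch of the call itself. The default Configuration and the path /tmp/example.txt are illustrative assumptions, not taken from any of the source files listed.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ExistsExample {
    public static void main(String[] args) throws IOException {
        // obtain the FileSystem for the default configuration (HDFS or local)
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // probe for the path before acting on it
        Path p = new Path("/tmp/example.txt"); // illustrative path, not from the sources
        if (fs.exists(p)) {
            System.out.println(p + " exists");
        } else {
            System.out.println(p + " does not exist");
        }
    }
}

Note that exists() is essentially a getFileStatus() probe, so an exists-then-open sequence is inherently racy: the file can disappear between the check and the open. This is why several of the examples below still catch IOException around the subsequent access.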
From source file: com.ibm.bi.dml.parser.DataExpression.java
License: Open Source License

public String[] readMatrixMarketFile(String filename, boolean conditional) throws LanguageException {
    String[] retVal = new String[2];
    retVal[0] = "";
    retVal[1] = "";
    boolean exists = false;

    try {
        FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf());
        Path pt = new Path(filename);
        if (fs.exists(pt)) {
            exists = true;
        }

        // only inspect the file status if the file exists; otherwise getFileStatus
        // throws FileNotFoundException and the "could not find" error below would
        // never be reported
        if (exists && fs.getFileStatus(pt).isDirectory()) {
            raiseValidateError("MatrixMarket files as directories not supported", conditional);
        } else if (exists) {
            BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(pt)));
            try {
                retVal[0] = in.readLine();
                // skip all commented lines (guard against end-of-file)
                do {
                    retVal[1] = in.readLine();
                } while (retVal[1] != null && retVal[1].charAt(0) == '%');

                if (retVal[0] == null || !retVal[0].startsWith("%%")) {
                    raiseValidateError("MatrixMarket files must begin with a header line.", conditional);
                }
            } finally {
                if (in != null)
                    in.close();
            }
        } else {
            raiseValidateError("Could not find the file: " + filename, conditional);
        }
    } catch (IOException e) {
        throw new LanguageException(e);
    }
    return retVal;
}
From source file: com.ibm.bi.dml.parser.DataExpression.java
License: Open Source License

public boolean checkHasMatrixMarketFormat(String inputFileName, String mtdFileName, boolean conditional)
        throws LanguageException {
    // Check whether the MTD file exists; if there is an MTD file, return false.
    JSONObject mtdObject = readMetadataFile(mtdFileName, conditional);
    if (mtdObject != null)
        return false;

    boolean exists = false;
    FileSystem fs = null;

    try {
        fs = FileSystem.get(ConfigurationManager.getCachedJobConf());
    } catch (Exception e) {
        LOG.error(this.printErrorLocation() + "could not read the configuration file.");
        throw new LanguageException(this.printErrorLocation() + "could not read the configuration file.", e);
    }

    Path pt = new Path(inputFileName);
    try {
        if (fs.exists(pt)) {
            exists = true;
        }
    } catch (Exception e) {
        LOG.error(this.printErrorLocation() + "file " + inputFileName + " not found");
        throw new LanguageException(this.printErrorLocation() + "file " + inputFileName + " not found");
    }

    try {
        // CASE: filename is a directory -- process as a directory
        if (exists && fs.getFileStatus(pt).isDirectory()) {
            // currently, only MM files stored as plain files are supported,
            // so a directory is likely not an MM file
            return false;
        }
        // CASE: filename points to a file
        else if (exists) {
            BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(pt)));
            String headerLine = "";
            try {
                if (in.ready())
                    headerLine = in.readLine();
            } finally {
                in.close();
            }

            // a well-formed MatrixMarket file starts with a "%%" header line;
            // anything else is inferred to be malformed
            return (headerLine != null && headerLine.startsWith("%%"));
        } else {
            return false;
        }
    } catch (Exception e) {
        return false;
    }
}
From source file: com.ibm.bi.dml.runtime.controlprogram.parfor.opt.OptimizerRuleBased.java
License: Open Source License

/**
 * Increasing the partition replication factor is beneficial if partitions are
 * read multiple times (e.g., in nested loops) because partitioning (done once)
 * gets slightly slower but there is a higher probability for local access.
 *
 * NOTE: this rewrite requires 'set data partitioner' to be executed first in
 * order to leverage the partitioning information in the plan tree.
 *
 * @param n
 * @throws DMLRuntimeException
 */
protected void rewriteSetPartitionReplicationFactor(OptNode n,
        HashMap<String, PDataPartitionFormat> partitionedMatrices, LocalVariableMap vars)
        throws DMLRuntimeException {
    boolean apply = false;
    double sizeReplicated = 0;
    int replication = ParForProgramBlock.WRITE_REPLICATION_FACTOR;

    ParForProgramBlock pfpb = (ParForProgramBlock) OptTreeConverter.getAbstractPlanMapping()
            .getMappedProg(n.getID())[1];

    if (n.getExecType() == ExecType.MR
            && n.getParam(ParamType.DATA_PARTITIONER).equals(PDataPartitioner.REMOTE_MR.toString())
            && n.hasNestedParallelism(false) && n.hasNestedPartitionReads(false)) {
        apply = true;

        // account for problem and cluster constraints
        replication = (int) Math.min(_N, _rnk);

        // account for internal max constraint (note: hadoop will warn if max > 10)
        replication = (int) Math.min(replication, MAX_REPLICATION_FACTOR_EXPORT);

        // account for remaining hdfs capacity
        try {
            FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf());
            long hdfsCapacityRemain = fs.getStatus().getRemaining();
            long sizeInputs = 0; // sum of all input sizes (w/o replication)
            for (String var : partitionedMatrices.keySet()) {
                MatrixObject mo = (MatrixObject) vars.get(var);
                Path fname = new Path(mo.getFileName());
                if (fs.exists(fname)) // non-existing (e.g., CP) -> small file
                    sizeInputs += fs.getContentSummary(fname).getLength();
            }
            replication = (int) Math.min(replication, Math.floor(0.9 * hdfsCapacityRemain / sizeInputs));

            // ensure at least replication 1
            replication = Math.max(replication, ParForProgramBlock.WRITE_REPLICATION_FACTOR);
            sizeReplicated = replication * sizeInputs;
        } catch (Exception ex) {
            throw new DMLRuntimeException("Failed to analyze remaining hdfs capacity.", ex);
        }
    }

    // modify the runtime plan
    if (apply)
        pfpb.setPartitionReplicationFactor(replication);
    _numEvaluatedPlans++;
    LOG.debug(getOptMode() + " OPT: rewrite 'set partition replication factor' - result=" + apply
            + ((apply) ? " (" + replication + ", " + toMB(sizeReplicated) + ")" : ""));
}
From source file: com.ibm.bi.dml.runtime.io.MatrixReader.java
License: Open Source License

/**
 * Validates that the given path exists and is non-empty.
 *
 * @param fs
 * @param path
 * @throws IOException
 */
protected static void checkValidInputFile(FileSystem fs, Path path) throws IOException {
    // check for non-existing file
    if (!fs.exists(path))
        throw new IOException("File " + path.toString() + " does not exist on HDFS/LFS.");

    // check for empty file
    if (MapReduceTool.isFileEmpty(fs, path.toString()))
        throw new EOFException("Empty input file " + path.toString() + ".");
}
From source file: com.ibm.bi.dml.runtime.io.WriterMatrixMarket.java
License: Open Source License

/**
 * Merges a text-cell file or a directory of part files into a single
 * MatrixMarket file, prepending the MatrixMarket header.
 *
 * @param srcFileName
 * @param fileName
 * @param rlen
 * @param clen
 * @param nnz
 * @throws IOException
 */
public void mergeTextcellToMatrixMarket(String srcFileName, String fileName, long rlen, long clen, long nnz)
        throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path src = new Path(srcFileName);
    Path merge = new Path(fileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (hdfs.exists(merge)) {
        hdfs.delete(merge, true);
    }
    OutputStream out = hdfs.create(merge, true);

    // write out the header first
    StringBuilder sb = new StringBuilder();
    sb.append("%%MatrixMarket matrix coordinate real general\n");

    // output number of rows, number of columns, and number of non-zeros
    sb.append(rlen + " " + clen + " " + nnz + "\n");
    out.write(sb.toString().getBytes());

    // if the source is a directory, concatenate all contained part files
    if (hdfs.getFileStatus(src).isDirectory()) {
        try {
            FileStatus[] contents = hdfs.listStatus(src);
            for (int i = 0; i < contents.length; i++) {
                if (!contents[i].isDirectory()) {
                    InputStream in = hdfs.open(contents[i].getPath());
                    try {
                        IOUtils.copyBytes(in, out, conf, false);
                    } finally {
                        IOUtilFunctions.closeSilently(in);
                    }
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(out);
        }
    } else if (hdfs.isFile(src)) {
        InputStream in = null;
        try {
            in = hdfs.open(src);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }
    } else {
        throw new IOException(src.toString() + ": No such file or directory");
    }
}
From source file: com.ibm.bi.dml.runtime.io.WriterTextCSV.java
License: Open Source License

/**
 * Method to merge multiple CSV part files on HDFS into a single CSV file on
 * HDFS. The part files are created by a CSV_WRITE MR job.
 *
 * This method is invoked from the CP-write instruction.
 *
 * @param srcFileName
 * @param destFileName
 * @param csvprop
 * @param rlen
 * @param clen
 * @throws IOException
 */
public void mergeCSVPartFiles(String srcFileName, String destFileName, CSVFileFormatProperties csvprop,
        long rlen, long clen) throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path mergedFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (hdfs.exists(mergedFilePath)) {
        hdfs.delete(mergedFilePath, true);
    }
    OutputStream out = hdfs.create(mergedFilePath, true);

    // write out the header, if needed
    if (csvprop.hasHeader()) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < clen; i++) {
            sb.append("C" + (i + 1));
            if (i < clen - 1)
                sb.append(csvprop.getDelim());
        }
        sb.append('\n');
        out.write(sb.toString().getBytes());
        sb.setLength(0);
    }

    // if the source is a directory, merge all contained part files
    if (hdfs.isDirectory(srcFilePath)) {
        try {
            FileStatus[] contents = hdfs.listStatus(srcFilePath);
            Path[] partPaths = new Path[contents.length];
            int numPartFiles = 0;
            for (int i = 0; i < contents.length; i++) {
                if (!contents[i].isDirectory()) {
                    // collect part files contiguously so that the sort below
                    // never encounters null entries (e.g., when the listing
                    // also contains subdirectories)
                    partPaths[numPartFiles] = contents[i].getPath();
                    numPartFiles++;
                }
            }
            Arrays.sort(partPaths, 0, numPartFiles);

            for (int i = 0; i < numPartFiles; i++) {
                InputStream in = hdfs.open(partPaths[i]);
                try {
                    IOUtils.copyBytes(in, out, conf, false);
                    if (i < numPartFiles - 1)
                        out.write('\n');
                } finally {
                    IOUtilFunctions.closeSilently(in);
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(out);
        }
    } else if (hdfs.isFile(srcFilePath)) {
        InputStream in = null;
        try {
            in = hdfs.open(srcFilePath);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }
    } else {
        throw new IOException(srcFilePath.toString() + ": No such file or directory");
    }
}
From source file: com.ibm.bi.dml.runtime.matrix.data.MultipleOutputCommitter.java
License: Open Source License

@Override
public void cleanupJob(JobContext context) throws IOException {
    JobConf conf = context.getJobConf();

    // clean up the temporary output directory
    Path outputPath = FileOutputFormat.getOutputPath(conf);
    if (outputPath != null) {
        FileSystem fs = outputPath.getFileSystem(conf);
        context.getProgressible().progress();
        if (fs.exists(outputPath))
            fs.delete(outputPath, true);
    }
}
From source file: com.ibm.bi.dml.runtime.matrix.data.MultipleOutputCommitter.java
License: Open Source License

@Override
public void commitTask(TaskAttemptContext context) throws IOException {
    JobConf conf = context.getJobConf();
    TaskAttemptID attemptId = context.getTaskAttemptID();

    // get the mapping between index and output filename
    outputs = MRJobConfiguration.getOutputs(conf);

    // get the temp task output path (compatible with hadoop1 and hadoop2)
    Path taskOutPath = FileOutputFormat.getWorkOutputPath(conf);
    FileSystem fs = taskOutPath.getFileSystem(conf);
    if (!fs.exists(taskOutPath))
        throw new IOException("Task output path " + taskOutPath.toString() + " does not exist.");

    // move the task outputs to their final places
    context.getProgressible().progress();
    moveFinalTaskOutputs(context, fs, taskOutPath);

    // delete the temporary task-specific output directory
    if (!fs.delete(taskOutPath, true))
        LOG.debug("Failed to delete the temporary output directory of task: " + attemptId + " - "
                + taskOutPath);
}
From source file: com.ibm.bi.dml.runtime.matrix.sort.SamplingSortMRInputFormat.java
License: Open Source License

/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them, and picks N-1 keys to generate N equally sized partitions.
 *
 * @param conf the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 * @throws IllegalAccessException
 * @throws InstantiationException
 */
@SuppressWarnings({ "unchecked", "unused", "deprecation" })
public static int writePartitionFile(JobConf conf, Path partFile)
        throws IOException, InstantiationException, IllegalAccessException {
    SamplingSortMRInputFormat inFormat = new SamplingSortMRInputFormat();
    Sampler sampler = new Sampler();

    Class<? extends WritableComparable> targetKeyClass;
    targetKeyClass = (Class<? extends WritableComparable>) conf.getClass(TARGET_KEY_CLASS,
            WritableComparable.class);

    // get input converter information (the converter indicates whether the matrix
    // value in this mapper is a matrix cell or a matrix block)
    int brlen = MRJobConfiguration.getNumRowsPerBlock(conf, (byte) 0);
    int bclen = MRJobConfiguration.getNumColumnsPerBlock(conf, (byte) 0);

    int partitions = conf.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 1000);
    InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
    int samples = Math.min(10, splits.length);
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.length / samples;

    // take N samples from different parts of the input
    int totalcount = 0;
    for (int i = 0; i < samples; ++i) {
        SequenceFileRecordReader reader = (SequenceFileRecordReader) inFormat
                .getRecordReader(splits[sampleStep * i], conf, null);
        int count = 0;
        WritableComparable key = (WritableComparable) reader.createKey();
        Writable value = (Writable) reader.createValue();
        while (reader.next(key, value) && count < recordsPerSample) {
            Converter inputConverter = MRJobConfiguration.getInputConverter(conf, (byte) 0);
            inputConverter.setBlockSize(brlen, bclen);
            inputConverter.convert(key, value);
            while (inputConverter.hasNext()) {
                Pair pair = inputConverter.next();
                if (pair.getKey() instanceof DoubleWritable) {
                    sampler.addValue(new DoubleWritable(((DoubleWritable) pair.getKey()).get()));
                } else if (pair.getValue() instanceof MatrixCell) {
                    sampler.addValue(new DoubleWritable(((MatrixCell) pair.getValue()).getValue()));
                } else
                    throw new IOException("SamplingSortMRInputFormat unsupported key/value class: "
                            + pair.getKey().getClass() + ":" + pair.getValue().getClass());
                count++;
            }
            key = (WritableComparable) reader.createKey();
            value = (Writable) reader.createValue();
        }
        totalcount += count;
    }

    if (totalcount == 0) // empty input files
        sampler.addValue(new DoubleWritable(0));

    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile)) {
        outFs.delete(partFile, false);
    }

    // note: key/value always double/null as expected by the partitioner
    SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile, DoubleWritable.class,
            NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    int index0 = -1, i = 0;
    boolean lessthan0 = true;
    for (WritableComparable splitValue : sampler.createPartitions(partitions)) {
        writer.append(splitValue, nullValue);
        if (lessthan0 && ((DoubleWritable) splitValue).get() >= 0) {
            index0 = i;
            lessthan0 = false;
        }
        i++;
    }
    if (lessthan0)
        index0 = partitions - 1;
    writer.close();

    return index0;
}
From source file: com.ibm.bi.dml.runtime.transform.DataTransform.java
License: Open Source License

/**
 * Helper function to move transformation metadata files from a temporary
 * location to their permanent location. These files (e.g., the header before
 * and after transformation) are generated by a single mapper while applying
 * data transformations. Note that these files must ultimately be placed
 * under the existing metadata directory (txMtdPath), which is
 * simultaneously read by other mappers. If they were not created at a
 * temporary location first, MR tasks would fail due to changing timestamps
 * on txMtdPath.
 *
 * @param fs
 * @param tmpPath
 * @param txMtdPath
 * @throws IllegalArgumentException
 * @throws IOException
 */
private static void moveFilesFromTmp(FileSystem fs, String tmpPath, String txMtdPath)
        throws IllegalArgumentException, IOException {
    // move files from the temporary location to txMtdPath
    MapReduceTool.renameFileOnHDFS(tmpPath + "/" + TransformationAgent.OUT_HEADER,
            txMtdPath + "/" + TransformationAgent.OUT_HEADER);
    MapReduceTool.renameFileOnHDFS(tmpPath + "/" + TransformationAgent.OUT_DCD_HEADER,
            txMtdPath + "/" + TransformationAgent.OUT_DCD_HEADER);
    MapReduceTool.renameFileOnHDFS(tmpPath + "/" + TransformationAgent.COLTYPES_FILE_NAME,
            txMtdPath + "/" + TransformationAgent.COLTYPES_FILE_NAME);

    if (fs.exists(new Path(tmpPath + "/Dummycode/" + TransformationAgent.DCD_FILE_NAME))) {
        if (!fs.exists(new Path(txMtdPath + "/Dummycode/")))
            fs.mkdirs(new Path(txMtdPath + "/Dummycode/"));
        MapReduceTool.renameFileOnHDFS(tmpPath + "/Dummycode/" + TransformationAgent.DCD_FILE_NAME,
                txMtdPath + "/Dummycode/" + TransformationAgent.DCD_FILE_NAME);
    }
}