List of usage examples for org.apache.hadoop.fs.FileStatus.getPath()
public Path getPath()
From source file:com.ibm.bi.dml.runtime.io.MatrixReader.java
License:Open Source License
/** * /*from ww w . j av a 2 s . c o m*/ * @param file * @return * @throws IOException */ public static Path[] getSequenceFilePaths(FileSystem fs, Path file) throws IOException { Path[] ret = null; if (fs.isDirectory(file)) { LinkedList<Path> tmp = new LinkedList<Path>(); FileStatus[] dStatus = fs.listStatus(file); for (FileStatus fdStatus : dStatus) if (!fdStatus.getPath().getName().startsWith("_")) //skip internal files tmp.add(fdStatus.getPath()); ret = tmp.toArray(new Path[0]); } else { ret = new Path[] { file }; } return ret; }
From source file:com.ibm.bi.dml.runtime.io.ReaderTextCSV.java
License:Open Source License
/** * // w w w. ja v a2 s . c om * @param path * @param job * @param fs * @param dest * @param rlen * @param clen * @param brlen * @param bclen * @param hasHeader * @param delim * @param fill * @param fillValue * @return * @throws IOException */ @SuppressWarnings("unchecked") private MatrixBlock readCSVMatrixFromHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen, long clen, int brlen, int bclen, boolean hasHeader, String delim, boolean fill, double fillValue) throws IOException { ArrayList<Path> files = new ArrayList<Path>(); if (fs.isDirectory(path)) { for (FileStatus stat : fs.listStatus(path, CSVReblockMR.hiddenFileFilter)) files.add(stat.getPath()); Collections.sort(files); } else files.add(path); if (dest == null) { dest = computeCSVSize(files, job, fs, hasHeader, delim, fill, fillValue); clen = dest.getNumColumns(); } boolean sparse = dest.isInSparseFormat(); ///////////////////////////////////////// String value = null; int row = 0; int col = -1; double cellValue = 0; long lnnz = 0; for (int fileNo = 0; fileNo < files.size(); fileNo++) { BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo)))); if (fileNo == 0 && hasHeader) br.readLine(); //ignore header // Read the data boolean emptyValuesFound = false; try { if (sparse) //SPARSE<-value { while ((value = br.readLine()) != null) //foreach line { String cellStr = value.toString().trim(); emptyValuesFound = false; String[] parts = IOUtilFunctions.split(cellStr, delim); col = 0; for (String part : parts) //foreach cell { part = part.trim(); if (part.isEmpty()) { emptyValuesFound = true; cellValue = fillValue; } else { cellValue = UtilFunctions.parseToDouble(part); } if (cellValue != 0) { dest.appendValue(row, col, cellValue); lnnz++; } col++; } //sanity checks for empty values and number of columns IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, fill, emptyValuesFound); IOUtilFunctions.checkAndRaiseErrorCSVNumColumns(path.toString(), cellStr, parts, 
clen); row++; } } else //DENSE<-value { while ((value = br.readLine()) != null) //foreach line { String cellStr = value.toString().trim(); emptyValuesFound = false; String[] parts = IOUtilFunctions.split(cellStr, delim); col = 0; for (String part : parts) //foreach cell { part = part.trim(); if (part.isEmpty()) { emptyValuesFound = true; cellValue = fillValue; } else { cellValue = UtilFunctions.parseToDouble(part); } if (cellValue != 0) { dest.setValueDenseUnsafe(row, col, cellValue); lnnz++; } col++; } //sanity checks for empty values and number of columns IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, fill, emptyValuesFound); IOUtilFunctions.checkAndRaiseErrorCSVNumColumns(path.toString(), cellStr, parts, clen); row++; } } } finally { IOUtilFunctions.closeSilently(br); } } //post processing dest.setNonZeros(lnnz); return dest; }
From source file:com.ibm.bi.dml.runtime.io.WriterTextCSV.java
License:Open Source License
/**
 * Adds a generated header line to CSV data and places the result at the destination.
 *
 * <p>The header consists of the column names {@code C1 .. C<clen>} joined by the
 * delimiter from {@code _props} and terminated by a newline.
 * <ul>
 * <li>If {@code _props} requests no header, the method returns without touching
 * either path.</li>
 * <li>If the source is a directory, the header is prepended to the first part file
 * (in sorted path order, after applying {@code CSVReblockMR.hiddenFileFilter}) and
 * the whole directory is then renamed to the destination.</li>
 * <li>If the source is a file, the destination file is created with the header
 * followed by the source contents.</li>
 * </ul>
 *
 * @param srcFileName source file or directory
 * @param destFileName destination path
 * @param rlen number of rows (not used by this method)
 * @param clen number of columns, i.e. the number of header names to generate
 * @throws IOException if the source does not exist, a source directory contains no
 *         part files, or reading/writing fails
 */
@SuppressWarnings("unchecked")
public void addHeaderToCSV(String srcFileName, String destFileName, long rlen, long clen) throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path destFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (!_props.hasHeader()) {
        // TODO: the original intent was to simply move srcFile to destFile here.
        // That move (delete dest, create dest to force its parent directories,
        // delete dest again, rename src to dest) is currently disabled because a
        // plain hdfs.rename() does not seem to create missing parent directories.
        // Until it is reinstated, nothing is done in this case.
        return;
    }

    // construct the header line
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < clen; i++) {
        sb.append("C" + (i + 1));
        if (i < clen - 1) {
            sb.append(_props.getDelim());
        }
    }
    sb.append('\n');
    byte[] header = sb.toString().getBytes();

    if (hdfs.isDirectory(srcFilePath)) {
        // compute sorted order among part files
        ArrayList<Path> files = new ArrayList<Path>();
        for (FileStatus stat : hdfs.listStatus(srcFilePath, CSVReblockMR.hiddenFileFilter)) {
            files.add(stat.getPath());
        }
        Collections.sort(files);

        // Fail with a descriptive error instead of an IndexOutOfBoundsException.
        if (files.isEmpty()) {
            throw new IOException(srcFilePath.toString() + ": No part files found in directory");
        }

        // first part file path
        Path firstpart = files.get(0);

        // create a temp file, and add header and contents of first part
        Path tmp = new Path(firstpart.toString() + ".tmp");
        OutputStream out = hdfs.create(tmp, true);
        InputStream in = null;
        try {
            // The header write is inside the try so 'out' is closed even if it fails.
            out.write(header);

            // copy rest of the data from firstpart
            in = hdfs.open(firstpart);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }

        // rename tmp to firstpart
        hdfs.delete(firstpart, true);
        hdfs.rename(tmp, firstpart);

        // rename srcfile to destFile
        hdfs.delete(destFilePath, true);
        hdfs.createNewFile(destFilePath); // force the creation of directory structure
        hdfs.delete(destFilePath, true); // delete the file, but preserve the directory structure
        hdfs.rename(srcFilePath, destFilePath); // move the data
    } else if (hdfs.isFile(srcFilePath)) {
        // create destination file
        OutputStream out = hdfs.create(destFilePath, true);
        InputStream in = null;
        try {
            // write header
            out.write(header);

            // copy the data from srcFile
            in = hdfs.open(srcFilePath);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }
    } else {
        throw new IOException(srcFilePath.toString() + ": No such file or directory");
    }
}
From source file:com.ibm.bi.dml.runtime.matrix.data.MultipleOutputCommitter.java
License:Open Source License
/** * /*from w w w. j ava2s. co m*/ * @param context * @param fs * @param taskOutput * @throws IOException */ private void moveFinalTaskOutputs(TaskAttemptContext context, FileSystem fs, Path taskOutput) throws IOException { context.getProgressible().progress(); if (fs.getFileStatus(taskOutput).isDirectory()) { FileStatus[] files = fs.listStatus(taskOutput); if (files != null) for (FileStatus file : files) //for all files if (!file.isDirectory()) //skip directories moveFileToDestination(context, fs, file.getPath()); } }
From source file:com.ibm.bi.dml.runtime.transform.ApplyTfCSVMR.java
License:Open Source License
/**
 * Deletes (non-recursively) every entry directly under {@code path} whose name
 * starts with {@code "part-"}.
 *
 * @param fs file system to operate on
 * @param path directory to clean
 * @throws FileNotFoundException declared by the original signature; propagated from the listing
 * @throws IOException if listing or deletion fails
 */
private static void deletePartFiles(FileSystem fs, Path path) throws FileNotFoundException, IOException {
    PathFilter partFilesOnly = new PathFilter() {
        @Override
        public boolean accept(Path candidate) {
            String fileName = candidate.getName();
            return fileName.startsWith("part-");
        }
    };

    FileStatus[] partFiles = fs.listStatus(path, partFilesOnly);
    for (int i = 0; i < partFiles.length; i++) {
        fs.delete(partFiles[i].getPath(), false);
    }
}
From source file:com.ibm.bi.dml.runtime.transform.DataTransform.java
License:Open Source License
/** * Helper function to fetch and sort the list of part files under the given * input directory./*w ww . ja v a2 s. c om*/ * * @param input * @param fs * @return * @throws FileNotFoundException * @throws IOException */ @SuppressWarnings("unchecked") private static ArrayList<Path> collectInputFiles(String input, FileSystem fs) throws FileNotFoundException, IOException { Path path = new Path(input); ArrayList<Path> files = new ArrayList<Path>(); if (fs.isDirectory(path)) { for (FileStatus stat : fs.listStatus(path, CSVReblockMR.hiddenFileFilter)) files.add(stat.getPath()); Collections.sort(files); } else files.add(path); return files; }
From source file:com.ibm.bi.dml.runtime.util.MapReduceTool.java
License:Open Source License
/**
 * Returns the paths of all entries directly under {@code dir} as one
 * comma-separated string.
 *
 * <p>Every listed entry is included, whether it is a file or a directory.
 *
 * @param dir directory to list
 * @return comma-separated entry paths; empty if the directory has no entries
 * @throws IOException if the file system cannot be obtained or listed
 */
public static String getSubDirs(String dir) throws IOException {
    FileSystem fs = FileSystem.get(_rJob);
    StringBuilder joined = new StringBuilder();

    for (FileStatus entry : fs.listStatus(new Path(dir))) {
        String separator = (joined.length() > 0) ? "," : "";
        joined.append(separator);
        joined.append(entry.getPath().toString());
    }
    return joined.toString();
}
From source file:com.ibm.bi.dml.runtime.util.MapReduceTool.java
License:Open Source License
/** * /*from w w w . ja va 2 s. c o m*/ * @param dir * @return * @throws IOException */ public static String getSubDirsIgnoreLogs(String dir) throws IOException { FileSystem fs = FileSystem.get(_rJob); FileStatus[] files = fs.listStatus(new Path(dir)); StringBuilder sb = new StringBuilder(); for (FileStatus file : files) { String name = file.getPath().toString(); if (name.contains("_logs")) continue; if (sb.length() > 0) sb.append(","); sb.append(name); } return sb.toString(); }
From source file:com.ibm.bi.dml.runtime.util.MapReduceTool.java
License:Open Source License
public static double[] pickValueWeight(String dir, NumItemsByEachReducerMetaData metadata, double p, boolean average) throws IOException { long[] counts = metadata.getNumItemsArray(); long[] ranges = new long[counts.length]; ranges[0] = counts[0];/*from ww w.ja v a2s.c om*/ for (int i = 1; i < counts.length; i++) ranges[i] = ranges[i - 1] + counts[i]; long total = ranges[ranges.length - 1]; // do averaging only if it is asked for; and sum_wt is even average = average && (total % 2 == 0); int currentPart = 0; double cum_weight = 0; long pos = (long) Math.ceil(total * p); while (ranges[currentPart] < pos) { currentPart++; cum_weight += ranges[currentPart]; } int offset; if (currentPart > 0) offset = (int) (pos - ranges[currentPart - 1] - 1); else offset = (int) pos - 1; FileSystem fs = FileSystem.get(_rJob); Path path = new Path(dir); FileStatus[] files = fs.listStatus(path); Path fileToRead = null; for (FileStatus file : files) if (file.getPath().toString().endsWith(Integer.toString(currentPart))) { fileToRead = file.getPath(); break; } if (fileToRead == null) throw new RuntimeException("cannot read partition " + currentPart); FSDataInputStream currentStream = fs.open(fileToRead); DoubleWritable readKey = new DoubleWritable(); IntWritable readValue = new IntWritable(); boolean contain0s = false; long numZeros = 0; if (currentPart == metadata.getPartitionOfZero()) { contain0s = true; numZeros = metadata.getNumberOfZero(); } ReadWithZeros reader = new ReadWithZeros(currentStream, contain0s, numZeros); int numRead = 0; while (numRead <= offset) { reader.readNextKeyValuePairs(readKey, readValue); numRead += readValue.get(); cum_weight += readValue.get(); } double ret = readKey.get(); if (average) { if (numRead <= offset + 1) { reader.readNextKeyValuePairs(readKey, readValue); cum_weight += readValue.get(); ret = (ret + readKey.get()) / 2; } } currentStream.close(); return new double[] { ret, (average ? -1 : readValue.get()), (average ? -1 : cum_weight) }; }
From source file:com.ibm.bi.dml.test.utils.TestUtils.java
License:Open Source License
/** * Compares contents of an expected file with the actual file, where rows may be permuted * @param expectedFile/* w w w . j a v a2 s. co m*/ * @param actualDir * @param epsilon */ public static void compareDMLMatrixWithJavaMatrixRowsOutOfOrder(String expectedFile, String actualDir, double epsilon) { try { FileSystem fs = FileSystem.get(conf); Path outDirectory = new Path(actualDir); Path compareFile = new Path(expectedFile); FSDataInputStream fsin = fs.open(compareFile); BufferedReader compareIn = new BufferedReader(new InputStreamReader(fsin)); HashMap<CellIndex, Double> expectedValues = new HashMap<CellIndex, Double>(); String line; while ((line = compareIn.readLine()) != null) { StringTokenizer st = new StringTokenizer(line, " "); int i = Integer.parseInt(st.nextToken()); int j = Integer.parseInt(st.nextToken()); double v = Double.parseDouble(st.nextToken()); expectedValues.put(new CellIndex(i, j), v); } compareIn.close(); HashMap<CellIndex, Double> actualValues = new HashMap<CellIndex, Double>(); FileStatus[] outFiles = fs.listStatus(outDirectory); for (FileStatus file : outFiles) { FSDataInputStream fsout = fs.open(file.getPath()); BufferedReader outIn = new BufferedReader(new InputStreamReader(fsout)); while ((line = outIn.readLine()) != null) { StringTokenizer st = new StringTokenizer(line, " "); int i = Integer.parseInt(st.nextToken()); int j = Integer.parseInt(st.nextToken()); double v = Double.parseDouble(st.nextToken()); actualValues.put(new CellIndex(i, j), v); } outIn.close(); } ArrayList<Double> e_list = new ArrayList<Double>(); for (CellIndex index : expectedValues.keySet()) { Double expectedValue = expectedValues.get(index); if (expectedValue != 0.0) e_list.add(expectedValue); } ArrayList<Double> a_list = new ArrayList<Double>(); for (CellIndex index : actualValues.keySet()) { Double actualValue = actualValues.get(index); if (actualValue != 0.0) a_list.add(actualValue); } Collections.sort(e_list); Collections.sort(a_list); assertTrue("Matrix nzs 
not equal", e_list.size() == a_list.size()); for (int i = 0; i < e_list.size(); i++) { assertTrue("Matrix values not equals", Math.abs(e_list.get(i) - a_list.get(i)) <= epsilon); } } catch (IOException e) { fail("unable to read file: " + e.getMessage()); } }