Example usage for org.apache.hadoop.fs FileStatus getPath

Introduction

This page collects example usages of org.apache.hadoop.fs.FileStatus.getPath() from open source projects.

Prototype

public Path getPath() 
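
For orientation, here is a minimal, self-contained sketch of getPath() in use; the /tmp/data path and the ListPaths wrapper class are illustrative, not taken from the examples below:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ListPaths {
    public static void main(String[] args) throws Exception {
        // obtain the default file system and list a directory
        FileSystem fs = FileSystem.get(new Configuration());
        for (FileStatus status : fs.listStatus(new Path("/tmp/data"))) {
            // getPath() returns the fully qualified path of each entry
            System.out.println(status.getPath());
        }
    }
}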

Usage

From source file: com.ibm.bi.dml.runtime.io.MatrixReader.java

License: Open Source License

/**
 * Collects all sequence file paths under the given file or directory,
 * skipping internal files whose names start with an underscore.
 * 
 * @param fs file system to query
 * @param file input file or directory
 * @return array of sequence file paths
 * @throws IOException
 */
public static Path[] getSequenceFilePaths(FileSystem fs, Path file) throws IOException {
    Path[] ret = null;

    if (fs.isDirectory(file)) {
        LinkedList<Path> tmp = new LinkedList<Path>();
        FileStatus[] dStatus = fs.listStatus(file);
        for (FileStatus fdStatus : dStatus)
            if (!fdStatus.getPath().getName().startsWith("_")) //skip internal files
                tmp.add(fdStatus.getPath());
        ret = tmp.toArray(new Path[0]);
    } else {
        ret = new Path[] { file };
    }

    return ret;
}

From source file: com.ibm.bi.dml.runtime.io.ReaderTextCSV.java

License: Open Source License

/**
 * Reads a CSV matrix from HDFS into the given destination block,
 * processing the part files of a directory input in sorted order.
 * 
 * @param path input file or directory
 * @param job job configuration
 * @param fs file system
 * @param dest destination matrix block (allocated and sized if null)
 * @param rlen number of rows
 * @param clen number of columns
 * @param brlen number of rows per block
 * @param bclen number of columns per block
 * @param hasHeader whether the first line is a header
 * @param delim field delimiter
 * @param fill whether empty fields are filled with fillValue
 * @param fillValue replacement value for empty fields
 * @return the populated matrix block
 * @throws IOException
 */
@SuppressWarnings("unchecked")
private MatrixBlock readCSVMatrixFromHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen,
        long clen, int brlen, int bclen, boolean hasHeader, String delim, boolean fill, double fillValue)
        throws IOException {
    ArrayList<Path> files = new ArrayList<Path>();
    if (fs.isDirectory(path)) {
        for (FileStatus stat : fs.listStatus(path, CSVReblockMR.hiddenFileFilter))
            files.add(stat.getPath());
        Collections.sort(files);
    } else
        files.add(path);

    if (dest == null) {
        dest = computeCSVSize(files, job, fs, hasHeader, delim, fill, fillValue);
        clen = dest.getNumColumns();
    }

    boolean sparse = dest.isInSparseFormat();

    /////////////////////////////////////////
    String value = null;
    int row = 0;
    int col = -1;
    double cellValue = 0;
    long lnnz = 0;

    for (int fileNo = 0; fileNo < files.size(); fileNo++) {
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))));
        if (fileNo == 0 && hasHeader)
            br.readLine(); //ignore header

        // Read the data
        boolean emptyValuesFound = false;
        try {
            if (sparse) //SPARSE<-value
            {
                while ((value = br.readLine()) != null) //foreach line
                {
                    String cellStr = value.trim();
                    emptyValuesFound = false;
                    String[] parts = IOUtilFunctions.split(cellStr, delim);
                    col = 0;

                    for (String part : parts) //foreach cell
                    {
                        part = part.trim();
                        if (part.isEmpty()) {
                            emptyValuesFound = true;
                            cellValue = fillValue;
                        } else {
                            cellValue = UtilFunctions.parseToDouble(part);
                        }
                        if (cellValue != 0) {
                            dest.appendValue(row, col, cellValue);
                            lnnz++;
                        }
                        col++;
                    }

                    //sanity checks for empty values and number of columns
                    IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, fill, emptyValuesFound);
                    IOUtilFunctions.checkAndRaiseErrorCSVNumColumns(path.toString(), cellStr, parts, clen);
                    row++;
                }
            } else //DENSE<-value
            {
                while ((value = br.readLine()) != null) //foreach line
                {
                    String cellStr = value.trim();
                    emptyValuesFound = false;
                    String[] parts = IOUtilFunctions.split(cellStr, delim);
                    col = 0;

                    for (String part : parts) //foreach cell
                    {
                        part = part.trim();
                        if (part.isEmpty()) {
                            emptyValuesFound = true;
                            cellValue = fillValue;
                        } else {
                            cellValue = UtilFunctions.parseToDouble(part);
                        }
                        if (cellValue != 0) {
                            dest.setValueDenseUnsafe(row, col, cellValue);
                            lnnz++;
                        }
                        col++;
                    }

                    //sanity checks for empty values and number of columns
                    IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, fill, emptyValuesFound);
                    IOUtilFunctions.checkAndRaiseErrorCSVNumColumns(path.toString(), cellStr, parts, clen);
                    row++;
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(br);
        }
    }

    //post processing
    dest.setNonZeros(lnnz);

    return dest;
}

From source file: com.ibm.bi.dml.runtime.io.WriterTextCSV.java

License: Open Source License

/**
 * Prepends a generated header line (C1, C2, ..., Cclen) to the given
 * CSV file, or to its first part file if the source is a directory.
 * 
 * @param srcFileName source file or directory on HDFS
 * @param destFileName destination file or directory
 * @param rlen number of rows
 * @param clen number of columns (determines the header width)
 * @throws IOException
 */
@SuppressWarnings("unchecked")
public void addHeaderToCSV(String srcFileName, String destFileName, long rlen, long clen) throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path destFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (!_props.hasHeader()) {
        // simply move srcFile to destFile

        /*
         * TODO: Remove this roundabout way! 
         * For example: destFilePath = /user/biadmin/csv/temp/out/file.csv 
         *              & the only path that exists already on HDFS is /user/biadmin/csv/.
         * In this case: the directory structure /user/biadmin/csv/temp/out must be created. 
         * Simple hdfs.rename() does not seem to create this directory structure.
         */

        // delete the destination file, if it exists already
        hdfs.delete(destFilePath, true);

        // create /user/biadmin/csv/temp/out/file.csv so that ..../temp/out/ is created
        hdfs.createNewFile(destFilePath);

        // delete the file "file.csv" but preserve the directory structure /user/biadmin/csv/temp/out/
        hdfs.delete(destFilePath, true);

        // finally, move the data to destFilePath = /user/biadmin/csv/temp/out/file.csv
        hdfs.rename(srcFilePath, destFilePath);

        return;
    }

    // construct the header line
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < clen; i++) {
        sb.append("C" + (i + 1));
        if (i < clen - 1)
            sb.append(_props.getDelim());
    }
    sb.append('\n');

    if (hdfs.isDirectory(srcFilePath)) {

        // compute sorted order among part files
        ArrayList<Path> files = new ArrayList<Path>();
        for (FileStatus stat : hdfs.listStatus(srcFilePath, CSVReblockMR.hiddenFileFilter))
            files.add(stat.getPath());
        Collections.sort(files);

        // first part file path
        Path firstpart = files.get(0);

        // create a temp file, and add header and contents of first part
        Path tmp = new Path(firstpart.toString() + ".tmp");
        OutputStream out = hdfs.create(tmp, true);
        out.write(sb.toString().getBytes());
        sb.setLength(0);

        // copy rest of the data from firstpart
        InputStream in = null;
        try {
            in = hdfs.open(firstpart);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }

        // rename tmp to firstpart
        hdfs.delete(firstpart, true);
        hdfs.rename(tmp, firstpart);

        // rename srcfile to destFile
        hdfs.delete(destFilePath, true);
        hdfs.createNewFile(destFilePath); // force the creation of directory structure
        hdfs.delete(destFilePath, true); // delete the file, but preserve the directory structure
        hdfs.rename(srcFilePath, destFilePath); // move the data 

    } else if (hdfs.isFile(srcFilePath)) {
        // create destination file
        OutputStream out = hdfs.create(destFilePath, true);

        // write header
        out.write(sb.toString().getBytes());
        sb.setLength(0);

        // copy the data from srcFile
        InputStream in = null;
        try {
            in = hdfs.open(srcFilePath);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }
    } else {
        throw new IOException(srcFilePath.toString() + ": No such file or directory");
    }
}
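
As the TODO in the first branch notes, a plain hdfs.rename() does not create missing parent directories, hence the create/delete workaround. A minimal alternative sketch using FileSystem.mkdirs() (a standard FileSystem method; this is not the project's code) might look like:

// create the parent directories explicitly, then move the data
Path parent = destFilePath.getParent();
if (parent != null && !hdfs.exists(parent))
    hdfs.mkdirs(parent);
hdfs.delete(destFilePath, true); // drop any stale destination
hdfs.rename(srcFilePath, destFilePath);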

From source file: com.ibm.bi.dml.runtime.matrix.data.MultipleOutputCommitter.java

License: Open Source License

/**
 * Moves all regular files under the given task output directory to
 * their final destinations, skipping nested directories.
 * 
 * @param context task attempt context
 * @param fs file system
 * @param taskOutput task output path
 * @throws IOException
 */
private void moveFinalTaskOutputs(TaskAttemptContext context, FileSystem fs, Path taskOutput)
        throws IOException {
    context.getProgressible().progress();

    if (fs.getFileStatus(taskOutput).isDirectory()) {
        FileStatus[] files = fs.listStatus(taskOutput);
        if (files != null)
            for (FileStatus file : files) //for all files
                if (!file.isDirectory()) //skip directories
                    moveFileToDestination(context, fs, file.getPath());
    }
}

From source file: com.ibm.bi.dml.runtime.transform.ApplyTfCSVMR.java

License: Open Source License

/**
 * Deletes all files whose names start with "part-" directly under the given path.
 * 
 * @param fs file system
 * @param path directory to clean
 * @throws IOException
 */
private static void deletePartFiles(FileSystem fs, Path path) throws FileNotFoundException, IOException {
    PathFilter filter = new PathFilter() {
        public boolean accept(Path file) {
            return file.getName().startsWith("part-");
        }
    };
    FileStatus[] list = fs.listStatus(path, filter);
    for (FileStatus stat : list) {
        fs.delete(stat.getPath(), false);
    }
}

From source file: com.ibm.bi.dml.runtime.transform.DataTransform.java

License: Open Source License

/**
 * Helper function to fetch and sort the list of part files under the given
 * input directory.
 * 
 * @param input input path (file or directory)
 * @param fs file system
 * @return sorted list of input file paths
 * @throws FileNotFoundException
 * @throws IOException
 */
@SuppressWarnings("unchecked")
private static ArrayList<Path> collectInputFiles(String input, FileSystem fs)
        throws FileNotFoundException, IOException {
    Path path = new Path(input);
    ArrayList<Path> files = new ArrayList<Path>();
    if (fs.isDirectory(path)) {
        for (FileStatus stat : fs.listStatus(path, CSVReblockMR.hiddenFileFilter))
            files.add(stat.getPath());
        Collections.sort(files);
    } else
        files.add(path);

    return files;
}
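
A hypothetical call site (the method is private, so this would sit inside DataTransform; the input path is made up for illustration):

FileSystem fs = FileSystem.get(new Configuration());
// gather all visible part files of a CSV input in sorted order
ArrayList<Path> files = collectInputFiles("/user/biadmin/csv/in", fs);
for (Path p : files)
    System.out.println(p.getName());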

From source file: com.ibm.bi.dml.runtime.util.MapReduceTool.java

License: Open Source License

/**
 * Returns a comma-separated list of all paths directly under the given directory.
 * 
 * @param dir directory to list
 * @return comma-separated list of paths
 * @throws IOException
 */
public static String getSubDirs(String dir) throws IOException {
    FileSystem fs = FileSystem.get(_rJob);
    FileStatus[] files = fs.listStatus(new Path(dir));
    StringBuilder sb = new StringBuilder();
    for (FileStatus file : files) {
        if (sb.length() > 0)
            sb.append(",");
        sb.append(file.getPath().toString());
    }
    return sb.toString();
}

From source file: com.ibm.bi.dml.runtime.util.MapReduceTool.java

License: Open Source License

/**
 * Returns a comma-separated list of all paths directly under the given
 * directory, ignoring paths that contain "_logs".
 * 
 * @param dir directory to list
 * @return comma-separated list of paths
 * @throws IOException
 */
public static String getSubDirsIgnoreLogs(String dir) throws IOException {
    FileSystem fs = FileSystem.get(_rJob);
    FileStatus[] files = fs.listStatus(new Path(dir));
    StringBuilder sb = new StringBuilder();
    for (FileStatus file : files) {
        String name = file.getPath().toString();
        if (name.contains("_logs"))
            continue;
        if (sb.length() > 0)
            sb.append(",");
        sb.append(name);
    }
    return sb.toString();
}

From source file: com.ibm.bi.dml.runtime.util.MapReduceTool.java

License: Open Source License

/**
 * Picks the value at quantile p from the sorted, partitioned output of a
 * sort job, optionally averaged with its successor when the total weight
 * is even.
 * 
 * @param dir directory containing the partitioned, sorted values
 * @param metadata per-reducer item counts
 * @param p quantile in (0,1]
 * @param average whether to average the two middle values
 * @return array of {value, count, cumulative weight}; count and weight are -1 when averaging
 * @throws IOException
 */
public static double[] pickValueWeight(String dir, NumItemsByEachReducerMetaData metadata, double p,
        boolean average) throws IOException {
    long[] counts = metadata.getNumItemsArray();
    long[] ranges = new long[counts.length];
    ranges[0] = counts[0];
    for (int i = 1; i < counts.length; i++)
        ranges[i] = ranges[i - 1] + counts[i];

    long total = ranges[ranges.length - 1];

    // do averaging only if it is asked for; and sum_wt is even
    average = average && (total % 2 == 0);

    int currentPart = 0;
    double cum_weight = 0;
    long pos = (long) Math.ceil(total * p);
    while (ranges[currentPart] < pos) {
        currentPart++;
        cum_weight += ranges[currentPart];
    }
    int offset;
    if (currentPart > 0)
        offset = (int) (pos - ranges[currentPart - 1] - 1);
    else
        offset = (int) pos - 1;

    FileSystem fs = FileSystem.get(_rJob);
    Path path = new Path(dir);
    FileStatus[] files = fs.listStatus(path);
    Path fileToRead = null;
    for (FileStatus file : files)
        if (file.getPath().toString().endsWith(Integer.toString(currentPart))) {
            fileToRead = file.getPath();
            break;
        }

    if (fileToRead == null)
        throw new RuntimeException("cannot read partition " + currentPart);

    FSDataInputStream currentStream = fs.open(fileToRead);
    DoubleWritable readKey = new DoubleWritable();
    IntWritable readValue = new IntWritable();

    boolean contain0s = false;
    long numZeros = 0;
    if (currentPart == metadata.getPartitionOfZero()) {
        contain0s = true;
        numZeros = metadata.getNumberOfZero();
    }
    ReadWithZeros reader = new ReadWithZeros(currentStream, contain0s, numZeros);

    int numRead = 0;
    while (numRead <= offset) {
        reader.readNextKeyValuePairs(readKey, readValue);
        numRead += readValue.get();
        cum_weight += readValue.get();
    }

    double ret = readKey.get();
    if (average) {
        if (numRead <= offset + 1) {
            reader.readNextKeyValuePairs(readKey, readValue);
            cum_weight += readValue.get();
            ret = (ret + readKey.get()) / 2;
        }
    }
    currentStream.close();
    return new double[] { ret, (average ? -1 : readValue.get()), (average ? -1 : cum_weight) };
}

From source file: com.ibm.bi.dml.test.utils.TestUtils.java

License: Open Source License

/**
 * Compares the contents of an expected file with the actual output files,
 * where rows may be permuted.
 * 
 * @param expectedFile expected values file
 * @param actualDir directory containing the actual output
 * @param epsilon tolerance for comparing values
 */
public static void compareDMLMatrixWithJavaMatrixRowsOutOfOrder(String expectedFile, String actualDir,
        double epsilon) {
    try {
        FileSystem fs = FileSystem.get(conf);
        Path outDirectory = new Path(actualDir);
        Path compareFile = new Path(expectedFile);
        FSDataInputStream fsin = fs.open(compareFile);
        BufferedReader compareIn = new BufferedReader(new InputStreamReader(fsin));

        HashMap<CellIndex, Double> expectedValues = new HashMap<CellIndex, Double>();
        String line;
        while ((line = compareIn.readLine()) != null) {
            StringTokenizer st = new StringTokenizer(line, " ");
            int i = Integer.parseInt(st.nextToken());
            int j = Integer.parseInt(st.nextToken());
            double v = Double.parseDouble(st.nextToken());
            expectedValues.put(new CellIndex(i, j), v);
        }
        compareIn.close();

        HashMap<CellIndex, Double> actualValues = new HashMap<CellIndex, Double>();

        FileStatus[] outFiles = fs.listStatus(outDirectory);

        for (FileStatus file : outFiles) {
            FSDataInputStream fsout = fs.open(file.getPath());
            BufferedReader outIn = new BufferedReader(new InputStreamReader(fsout));

            while ((line = outIn.readLine()) != null) {
                StringTokenizer st = new StringTokenizer(line, " ");
                int i = Integer.parseInt(st.nextToken());
                int j = Integer.parseInt(st.nextToken());
                double v = Double.parseDouble(st.nextToken());
                actualValues.put(new CellIndex(i, j), v);
            }
            outIn.close();
        }

        ArrayList<Double> e_list = new ArrayList<Double>();
        for (CellIndex index : expectedValues.keySet()) {
            Double expectedValue = expectedValues.get(index);
            if (expectedValue != 0.0)
                e_list.add(expectedValue);
        }

        ArrayList<Double> a_list = new ArrayList<Double>();
        for (CellIndex index : actualValues.keySet()) {
            Double actualValue = actualValues.get(index);
            if (actualValue != 0.0)
                a_list.add(actualValue);
        }

        Collections.sort(e_list);
        Collections.sort(a_list);

        assertTrue("Matrix nzs not equal", e_list.size() == a_list.size());
        for (int i = 0; i < e_list.size(); i++) {
            assertTrue("Matrix values not equals", Math.abs(e_list.get(i) - a_list.get(i)) <= epsilon);
        }

    } catch (IOException e) {
        fail("unable to read file: " + e.getMessage());
    }
}
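
A hypothetical test invocation, with made-up file names:

// compare a result directory against a known-good file, ignoring row order,
// with values considered equal when they differ by at most 1e-10
compareDMLMatrixWithJavaMatrixRowsOutOfOrder("expected/B.mtx", "out/B", 1e-10);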