Example usage for org.apache.hadoop.fs FileStatus getPath

Introduction

This page collects example usages of org.apache.hadoop.fs.FileStatus.getPath() from open source projects.

Prototype

public Path getPath() 
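
For orientation, here is a minimal, self-contained sketch of getPath() in use; the /tmp/data path and the ListPaths wrapper class are illustrative, not taken from the examples below:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ListPaths {
    public static void main(String[] args) throws Exception {
        // obtain the default file system and list a directory
        FileSystem fs = FileSystem.get(new Configuration());
        for (FileStatus status : fs.listStatus(new Path("/tmp/data"))) {
            // getPath() returns the fully qualified path of each entry
            System.out.println(status.getPath());
        }
    }
}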

Usage

From source file: com.ibm.bi.dml.runtime.io.MatrixReader.java

License: Open Source License

/**
 * Collects all sequence file paths under the given file or directory,
 * skipping internal files whose names start with an underscore.
 * 
 * @param fs file system to query
 * @param file input file or directory
 * @return array of sequence file paths
 * @throws IOException
 */
public static Path[] getSequenceFilePaths(FileSystem fs, Path file) throws IOException {
    Path[] ret = null;

    if (fs.isDirectory(file)) {
        LinkedList<Path> tmp = new LinkedList<Path>();
        FileStatus[] dStatus = fs.listStatus(file);
        for (FileStatus fdStatus : dStatus)
            if (!fdStatus.getPath().getName().startsWith("_")) //skip internal files
                tmp.add(fdStatus.getPath());
        ret = tmp.toArray(new Path[0]);
    } else {
        ret = new Path[] { file };
    }

    return ret;
}

From source file: com.ibm.bi.dml.runtime.io.ReaderTextCSV.java

License: Open Source License

/**
 * Reads a CSV matrix from HDFS into the given destination block,
 * processing the part files of a directory input in sorted order.
 * 
 * @param path input file or directory
 * @param job job configuration
 * @param fs file system
 * @param dest destination matrix block (allocated and sized if null)
 * @param rlen number of rows
 * @param clen number of columns
 * @param brlen number of rows per block
 * @param bclen number of columns per block
 * @param hasHeader whether the first line is a header
 * @param delim field delimiter
 * @param fill whether empty fields are filled with fillValue
 * @param fillValue replacement value for empty fields
 * @return the populated matrix block
 * @throws IOException
 */
@SuppressWarnings("unchecked")
private MatrixBlock readCSVMatrixFromHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen,
        long clen, int brlen, int bclen, boolean hasHeader, String delim, boolean fill, double fillValue)
        throws IOException {
    ArrayList<Path> files = new ArrayList<Path>();
    if (fs.isDirectory(path)) {
        for (FileStatus stat : fs.listStatus(path, CSVReblockMR.hiddenFileFilter))
            files.add(stat.getPath());
        Collections.sort(files);
    } else
        files.add(path);

    if (dest == null) {
        dest = computeCSVSize(files, job, fs, hasHeader, delim, fill, fillValue);
        clen = dest.getNumColumns();
    }

    boolean sparse = dest.isInSparseFormat();

    /////////////////////////////////////////
    String value = null;
    int row = 0;
    int col = -1;
    double cellValue = 0;
    long lnnz = 0;

    for (int fileNo = 0; fileNo < files.size(); fileNo++) {
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))));
        if (fileNo == 0 && hasHeader)
            br.readLine(); //ignore header

        // Read the data
        boolean emptyValuesFound = false;
        try {
            if (sparse) //SPARSE<-value
            {
                while ((value = br.readLine()) != null) //foreach line
                {
                    String cellStr = value.trim();
                    emptyValuesFound = false;
                    String[] parts = IOUtilFunctions.split(cellStr, delim);
                    col = 0;

                    for (String part : parts) //foreach cell
                    {
                        part = part.trim();
                        if (part.isEmpty()) {
                            emptyValuesFound = true;
                            cellValue = fillValue;
                        } else {
                            cellValue = UtilFunctions.parseToDouble(part);
                        }
                        if (cellValue != 0) {
                            dest.appendValue(row, col, cellValue);
                            lnnz++;
                        }
                        col++;
                    }

                    //sanity checks for empty values and number of columns
                    IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, fill, emptyValuesFound);
                    IOUtilFunctions.checkAndRaiseErrorCSVNumColumns(path.toString(), cellStr, parts, clen);
                    row++;
                }
            } else //DENSE<-value
            {
                while ((value = br.readLine()) != null) //foreach line
                {
                    String cellStr = value.trim();
                    emptyValuesFound = false;
                    String[] parts = IOUtilFunctions.split(cellStr, delim);
                    col = 0;

                    for (String part : parts) //foreach cell
                    {
                        part = part.trim();
                        if (part.isEmpty()) {
                            emptyValuesFound = true;
                            cellValue = fillValue;
                        } else {
                            cellValue = UtilFunctions.parseToDouble(part);
                        }
                        if (cellValue != 0) {
                            dest.setValueDenseUnsafe(row, col, cellValue);
                            lnnz++;
                        }
                        col++;
                    }

                    //sanity checks for empty values and number of columns
                    IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, fill, emptyValuesFound);
                    IOUtilFunctions.checkAndRaiseErrorCSVNumColumns(path.toString(), cellStr, parts, clen);
                    row++;
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(br);
        }
    }

    //post processing
    dest.setNonZeros(lnnz);

    return dest;
}

From source file: com.ibm.bi.dml.runtime.io.WriterTextCSV.java

License: Open Source License

/**
 * Prepends a generated header line (C1, C2, ..., Cclen) to the given
 * CSV file, or to its first part file if the source is a directory.
 * 
 * @param srcFileName source file or directory on HDFS
 * @param destFileName destination file or directory
 * @param rlen number of rows
 * @param clen number of columns (determines the header width)
 * @throws IOException
 */
@SuppressWarnings("unchecked")
public void addHeaderToCSV(String srcFileName, String destFileName, long rlen, long clen) throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path destFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (!_props.hasHeader()) {
        // simply move srcFile to destFile

        /*
         * TODO: Remove this roundabout way! 
         * For example: destFilePath = /user/biadmin/csv/temp/out/file.csv 
         *              & the only path that exists already on HDFS is /user/biadmin/csv/.
         * In this case: the directory structure /user/biadmin/csv/temp/out must be created. 
         * Simple hdfs.rename() does not seem to create this directory structure.
         */

        // delete the destination file, if it exists already
        hdfs.delete(destFilePath, true);

        // create /user/biadmin/csv/temp/out/file.csv so that ..../temp/out/ is created
        hdfs.createNewFile(destFilePath);

        // delete the file "file.csv" but preserve the directory structure /user/biadmin/csv/temp/out/
        hdfs.delete(destFilePath, true);

        // finally, move the data to destFilePath = /user/biadmin/csv/temp/out/file.csv
        hdfs.rename(srcFilePath, destFilePath);

        return;
    }

    // construct the header line
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < clen; i++) {
        sb.append("C" + (i + 1));
        if (i < clen - 1)
            sb.append(_props.getDelim());
    }
    sb.append('\n');

    if (hdfs.isDirectory(srcFilePath)) {

        // compute sorted order among part files
        ArrayList<Path> files = new ArrayList<Path>();
        for (FileStatus stat : hdfs.listStatus(srcFilePath, CSVReblockMR.hiddenFileFilter))
            files.add(stat.getPath());
        Collections.sort(files);

        // first part file path
        Path firstpart = files.get(0);

        // create a temp file, and add header and contents of first part
        Path tmp = new Path(firstpart.toString() + ".tmp");
        OutputStream out = hdfs.create(tmp, true);
        out.write(sb.toString().getBytes());
        sb.setLength(0);

        // copy rest of the data from firstpart
        InputStream in = null;
        try {
            in = hdfs.open(firstpart);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }

        // rename tmp to firstpart
        hdfs.delete(firstpart, true);
        hdfs.rename(tmp, firstpart);

        // rename srcfile to destFile
        hdfs.delete(destFilePath, true);
        hdfs.createNewFile(destFilePath); // force the creation of directory structure
        hdfs.delete(destFilePath, true); // delete the file, but preserve the directory structure
        hdfs.rename(srcFilePath, destFilePath); // move the data 

    } else if (hdfs.isFile(srcFilePath)) {
        // create destination file
        OutputStream out = hdfs.create(destFilePath, true);

        // write header
        out.write(sb.toString().getBytes());
        sb.setLength(0);

        // copy the data from srcFile
        InputStream in = null;
        try {
            in = hdfs.open(srcFilePath);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }
    } else {
        throw new IOException(srcFilePath.toString() + ": No such file or directory");
    }
}
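
As the TODO in the first branch notes, a plain hdfs.rename() does not create missing parent directories, hence the create/delete workaround. A minimal alternative sketch using FileSystem.mkdirs() (a standard FileSystem method; this is not the project's code) might look like:

// create the parent directories explicitly, then move the data
Path parent = destFilePath.getParent();
if (parent != null && !hdfs.exists(parent))
    hdfs.mkdirs(parent);
hdfs.delete(destFilePath, true); // drop any stale destination
hdfs.rename(srcFilePath, destFilePath);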

From source file: com.ibm.bi.dml.runtime.matrix.data.MultipleOutputCommitter.java

License: Open Source License

/**
 * Moves all regular files under the given task output directory to
 * their final destinations, skipping nested directories.
 * 
 * @param context task attempt context
 * @param fs file system
 * @param taskOutput task output path
 * @throws IOException
 */
private void moveFinalTaskOutputs(TaskAttemptContext context, FileSystem fs, Path taskOutput)
        throws IOException {
    context.getProgressible().progress();

    if (fs.getFileStatus(taskOutput).isDirectory()) {
        FileStatus[] files = fs.listStatus(taskOutput);
        if (files != null)
            for (FileStatus file : files) //for all files
                if (!file.isDirectory()) //skip directories
                    moveFileToDestination(context, fs, file.getPath());
    }
}

From source file: com.ibm.bi.dml.runtime.transform.ApplyTfCSVMR.java

License: Open Source License

/**
 * Deletes all files whose names start with "part-" directly under the given path.
 * 
 * @param fs file system
 * @param path directory to clean
 * @throws IOException
 */
private static void deletePartFiles(FileSystem fs, Path path) throws FileNotFoundException, IOException {
    PathFilter filter = new PathFilter() {
        public boolean accept(Path file) {
            return file.getName().startsWith("part-");
        }
    };
    FileStatus[] list = fs.listStatus(path, filter);
    for (FileStatus stat : list) {
        fs.delete(stat.getPath(), false);
    }
}

From source file: com.ibm.bi.dml.runtime.transform.DataTransform.java

License: Open Source License

/**
 * Helper function to fetch and sort the list of part files under the given
 * input directory.
 * 
 * @param input input path (file or directory)
 * @param fs file system
 * @return sorted list of input file paths
 * @throws FileNotFoundException
 * @throws IOException
 */
@SuppressWarnings("unchecked")
private static ArrayList<Path> collectInputFiles(String input, FileSystem fs)
        throws FileNotFoundException, IOException {
    Path path = new Path(input);
    ArrayList<Path> files = new ArrayList<Path>();
    if (fs.isDirectory(path)) {
        for (FileStatus stat : fs.listStatus(path, CSVReblockMR.hiddenFileFilter))
            files.add(stat.getPath());
        Collections.sort(files);
    } else
        files.add(path);

    return files;
}
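
A hypothetical call site (the method is private, so this would sit inside DataTransform; the input path is made up for illustration):

FileSystem fs = FileSystem.get(new Configuration());
// gather all visible part files of a CSV input in sorted order
ArrayList<Path> files = collectInputFiles("/user/biadmin/csv/in", fs);
for (Path p : files)
    System.out.println(p.getName());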

From source file: com.ibm.bi.dml.runtime.util.MapReduceTool.java

License: Open Source License

/**
 * Returns a comma-separated list of all paths directly under the given directory.
 * 
 * @param dir directory to list
 * @return comma-separated list of paths
 * @throws IOException
 */
public static String getSubDirs(String dir) throws IOException {
    FileSystem fs = FileSystem.get(_rJob);
    FileStatus[] files = fs.listStatus(new Path(dir));
    StringBuilder sb = new StringBuilder();
    for (FileStatus file : files) {
        if (sb.length() > 0)
            sb.append(",");
        sb.append(file.getPath().toString());
    }
    return sb.toString();
}

From source file: com.ibm.bi.dml.runtime.util.MapReduceTool.java

License: Open Source License

/**
 * Returns a comma-separated list of all paths directly under the given
 * directory, ignoring paths that contain "_logs".
 * 
 * @param dir directory to list
 * @return comma-separated list of paths
 * @throws IOException
 */
public static String getSubDirsIgnoreLogs(String dir) throws IOException {
    FileSystem fs = FileSystem.get(_rJob);
    FileStatus[] files = fs.listStatus(new Path(dir));
    StringBuilder sb = new StringBuilder();
    for (FileStatus file : files) {
        String name = file.getPath().toString();
        if (name.contains("_logs"))
            continue;
        if (sb.length() > 0)
            sb.append(",");
        sb.append(name);
    }
    return sb.toString();
}

From source file: com.ibm.bi.dml.runtime.util.MapReduceTool.java

License: Open Source License

/**
 * Picks the value at quantile p from the sorted, partitioned output of a
 * sort job, optionally averaged with its successor when the total weight
 * is even.
 * 
 * @param dir directory containing the partitioned, sorted values
 * @param metadata per-reducer item counts
 * @param p quantile in (0,1]
 * @param average whether to average the two middle values
 * @return array of {value, count, cumulative weight}; count and weight are -1 when averaging
 * @throws IOException
 */
public static double[] pickValueWeight(String dir, NumItemsByEachReducerMetaData metadata, double p,
        boolean average) throws IOException {
    long[] counts = metadata.getNumItemsArray();
    long[] ranges = new long[counts.length];
    ranges[0] = counts[0];
    for (int i = 1; i < counts.length; i++)
        ranges[i] = ranges[i - 1] + counts[i];

    long total = ranges[ranges.length - 1];

    // do averaging only if it is asked for; and sum_wt is even
    average = average && (total % 2 == 0);

    int currentPart = 0;
    double cum_weight = 0;
    long pos = (long) Math.ceil(total * p);
    while (ranges[currentPart] < pos) {
        currentPart++;
        cum_weight += ranges[currentPart];
    }
    int offset;
    if (currentPart > 0)
        offset = (int) (pos - ranges[currentPart - 1] - 1);
    else
        offset = (int) pos - 1;

    FileSystem fs = FileSystem.get(_rJob);
    Path path = new Path(dir);
    FileStatus[] files = fs.listStatus(path);
    Path fileToRead = null;
    for (FileStatus file : files)
        if (file.getPath().toString().endsWith(Integer.toString(currentPart))) {
            fileToRead = file.getPath();
            break;
        }

    if (fileToRead == null)
        throw new RuntimeException("cannot read partition " + currentPart);

    FSDataInputStream currentStream = fs.open(fileToRead);
    DoubleWritable readKey = new DoubleWritable();
    IntWritable readValue = new IntWritable();

    boolean contain0s = false;
    long numZeros = 0;
    if (currentPart == metadata.getPartitionOfZero()) {
        contain0s = true;
        numZeros = metadata.getNumberOfZero();
    }
    ReadWithZeros reader = new ReadWithZeros(currentStream, contain0s, numZeros);

    int numRead = 0;
    while (numRead <= offset) {
        reader.readNextKeyValuePairs(readKey, readValue);
        numRead += readValue.get();
        cum_weight += readValue.get();
    }

    double ret = readKey.get();
    if (average) {
        if (numRead <= offset + 1) {
            reader.readNextKeyValuePairs(readKey, readValue);
            cum_weight += readValue.get();
            ret = (ret + readKey.get()) / 2;
        }
    }
    currentStream.close();
    return new double[] { ret, (average ? -1 : readValue.get()), (average ? -1 : cum_weight) };
}

From source file: com.ibm.bi.dml.test.utils.TestUtils.java

License: Open Source License

/**
 * Compares the contents of an expected file with the actual output files,
 * where rows may be permuted.
 * 
 * @param expectedFile expected values file
 * @param actualDir directory containing the actual output
 * @param epsilon tolerance for comparing values
 */
public static void compareDMLMatrixWithJavaMatrixRowsOutOfOrder(String expectedFile, String actualDir,
        double epsilon) {
    try {
        FileSystem fs = FileSystem.get(conf);
        Path outDirectory = new Path(actualDir);
        Path compareFile = new Path(expectedFile);
        FSDataInputStream fsin = fs.open(compareFile);
        BufferedReader compareIn = new BufferedReader(new InputStreamReader(fsin));

        HashMap<CellIndex, Double> expectedValues = new HashMap<CellIndex, Double>();
        String line;
        while ((line = compareIn.readLine()) != null) {
            StringTokenizer st = new StringTokenizer(line, " ");
            int i = Integer.parseInt(st.nextToken());
            int j = Integer.parseInt(st.nextToken());
            double v = Double.parseDouble(st.nextToken());
            expectedValues.put(new CellIndex(i, j), v);
        }
        compareIn.close();

        HashMap<CellIndex, Double> actualValues = new HashMap<CellIndex, Double>();

        FileStatus[] outFiles = fs.listStatus(outDirectory);

        for (FileStatus file : outFiles) {
            FSDataInputStream fsout = fs.open(file.getPath());
            BufferedReader outIn = new BufferedReader(new InputStreamReader(fsout));

            while ((line = outIn.readLine()) != null) {
                StringTokenizer st = new StringTokenizer(line, " ");
                int i = Integer.parseInt(st.nextToken());
                int j = Integer.parseInt(st.nextToken());
                double v = Double.parseDouble(st.nextToken());
                actualValues.put(new CellIndex(i, j), v);
            }
            outIn.close();
        }

        ArrayList<Double> e_list = new ArrayList<Double>();
        for (CellIndex index : expectedValues.keySet()) {
            Double expectedValue = expectedValues.get(index);
            if (expectedValue != 0.0)
                e_list.add(expectedValue);
        }

        ArrayList<Double> a_list = new ArrayList<Double>();
        for (CellIndex index : actualValues.keySet()) {
            Double actualValue = actualValues.get(index);
            if (actualValue != 0.0)
                a_list.add(actualValue);
        }

        Collections.sort(e_list);
        Collections.sort(a_list);

        assertTrue("Matrix nzs not equal", e_list.size() == a_list.size());
        for (int i = 0; i < e_list.size(); i++) {
            assertTrue("Matrix values not equals", Math.abs(e_list.get(i) - a_list.get(i)) <= epsilon);
        }

    } catch (IOException e) {
        fail("unable to read file: " + e.getMessage());
    }
}
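
A hypothetical test invocation, with made-up file names:

// compare a result directory against a known-good file, ignoring row order,
// with values considered equal when they differ by at most 1e-10
compareDMLMatrixWithJavaMatrixRowsOutOfOrder("expected/B.mtx", "out/B", 1e-10);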