Usage examples for org.apache.hadoop.fs.FileSystem.globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Return all files that match pathPattern and are not checksum files. The call may return null when nothing matches, so callers should null-check the result (as several of the examples below do).
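Before the full examples, here is a minimal sketch of the call pattern. The directory and glob (/data/out/part-*) are illustrative placeholders, not taken from any example below; the null check matters because globStatus can report "no match" that way.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Match every part file under a hypothetical job output directory.
        FileStatus[] matches = fs.globStatus(new Path("/data/out/part-*"));
        if (matches == null) {
            System.out.println("nothing matched the pattern");
            return;
        }
        for (FileStatus status : matches) {
            System.out.println(status.getPath() + "\t" + status.getLen() + " bytes");
        }
    }
}

Each FileStatus carries the matched path plus its length and other metadata, so the examples below typically glob once and then iterate the array.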
From source file:nthu.scopelab.tsqr.ssvd.SSVDSolver.java
License:Apache License
/**
 * Helper capability to load distributed row matrices into a dense matrix
 * (mainly to support tests).
 *
 * @param fs filesystem
 * @param glob FS glob
 * @param conf configuration
 * @return dense matrix array
 * @throws IOException when I/O occurs.
 */
public static double[][] loadDistributedRowMatrix(FileSystem fs, Path glob, Configuration conf)
        throws IOException {
    FileStatus[] files = fs.globStatus(glob);
    if (files == null) {
        return null;
    }

    List<double[]> denseData = new ArrayList<double[]>();

    /*
     * Assume it is partitioned output, so we need to read the files in order
     * of partitions.
     */
    Arrays.sort(files, PARTITION_COMPARATOR);

    IntWritable key = null;
    VectorWritable value = null;
    for (FileStatus fstat : files) {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, fstat.getPath(), fs.getConf());
        try {
            key = (IntWritable) reader.getKeyClass().newInstance();
            value = (VectorWritable) reader.getValueClass().newInstance();
        } catch (Exception e) {
            e.printStackTrace();
        }
        while (reader.next(key, value)) {
            Vector v = value.get();
            int size = v.size();
            double[] row = new double[size];
            for (int i = 0; i < size; i++) {
                row[i] = v.get(i);
            }
            // Ignore row label.
            denseData.add(row);
        }
        // Close each reader before opening the next to avoid leaking file handles.
        reader.close();
    }

    return denseData.toArray(new double[denseData.size()][]);
}
From source file:nthu.scopelab.tsqr.ssvd.SSVDSolver.java
License:Apache License
/**
 * Load multiple upper triangular matrices and sum them up.
 *
 * @param fs filesystem
 * @param glob FS glob
 * @param conf configuration
 * @return the sum of the upper triangular inputs.
 * @throws IOException when I/O occurs.
 */
public static cmUpperTriangDenseMatrix loadAndSumUpperTriangMatrices(FileSystem fs, Path glob,
        Configuration conf) throws IOException {
    FileStatus[] files = fs.globStatus(glob);
    if (files == null) {
        return null;
    }

    /*
     * Assume it is partitioned output, so we need to read the files in order
     * of partitions.
     */
    Arrays.sort(files, PARTITION_COMPARATOR);

    DenseVector result = null;
    IntWritable key = null;
    VectorWritable value = null;
    for (FileStatus fstat : files) {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, fstat.getPath(), fs.getConf());
        try {
            key = (IntWritable) reader.getKeyClass().newInstance();
            value = (VectorWritable) reader.getValueClass().newInstance();
        } catch (Exception e) {
            e.printStackTrace();
        }
        while (reader.next(key, value)) {
            Vector v = value.get();
            if (result == null) {
                result = new DenseVector(v);
            } else {
                result.add(v);
            }
        }
        // Close each reader before opening the next to avoid leaking file handles.
        reader.close();
    }

    if (result == null) {
        throw new IOException("Unexpected underrun in upper triangular matrix files");
    }
    return new cmUpperTriangDenseMatrix(result.getData());
}
From source file:org.apache.accumulo.core.util.TableDiskUsage.java
License:Apache License
public static void printDiskUsage(AccumuloConfiguration acuConf, Collection<String> tables, FileSystem fs,
        Connector conn, Printer printer, boolean humanReadable) throws TableNotFoundException, IOException {

    TableDiskUsage tdu = new TableDiskUsage();

    // Resolve table names to table IDs.
    HashSet<String> tableIds = new HashSet<String>();
    for (String tableName : tables) {
        String tableId = conn.tableOperations().tableIdMap().get(tableName);
        if (tableId == null)
            throw new TableNotFoundException(null, tableName, "Table " + tableName + " not found");
        tableIds.add(tableId);
    }

    for (String tableId : tableIds)
        tdu.addTable(tableId);

    HashSet<String> tablesReferenced = new HashSet<String>(tableIds);
    HashSet<String> emptyTableIds = new HashSet<String>();

    // Scan the metadata table to find the data files referenced by each table.
    for (String tableId : tableIds) {
        Scanner mdScanner = conn.createScanner(Constants.METADATA_TABLE_NAME, Constants.NO_AUTHS);
        mdScanner.fetchColumnFamily(Constants.METADATA_DATAFILE_COLUMN_FAMILY);
        mdScanner.setRange(new KeyExtent(new Text(tableId), null, null).toMetadataRange());

        if (!mdScanner.iterator().hasNext()) {
            emptyTableIds.add(tableId);
        }

        for (Entry<Key, Value> entry : mdScanner) {
            String file = entry.getKey().getColumnQualifier().toString();
            if (file.startsWith("../")) {
                file = file.substring(2);
                tablesReferenced.add(file.split("\\/")[1]);
            } else
                file = "/" + tableId + file;
            tdu.linkFileAndTable(tableId, file);
        }
    }

    // Glob each referenced table's directories to get the size of every file.
    for (String tableId : tablesReferenced) {
        FileStatus[] files = fs.globStatus(new Path(Constants.getTablesDir(acuConf) + "/" + tableId + "/*/*"));
        for (FileStatus fileStatus : files) {
            String dir = fileStatus.getPath().getParent().getName();
            String name = fileStatus.getPath().getName();
            tdu.addFileSize("/" + tableId + "/" + dir + "/" + name, fileStatus.getLen());
        }
    }

    HashMap<String, String> reverseTableIdMap = new HashMap<String, String>();
    for (Entry<String, String> entry : conn.tableOperations().tableIdMap().entrySet())
        reverseTableIdMap.put(entry.getValue(), entry.getKey());

    // Order usage entries by the set of table names that share each group of files.
    TreeMap<TreeSet<String>, Long> usage = new TreeMap<TreeSet<String>, Long>(
            new Comparator<TreeSet<String>>() {
                @Override
                public int compare(TreeSet<String> o1, TreeSet<String> o2) {
                    int len1 = o1.size();
                    int len2 = o2.size();
                    int min = Math.min(len1, len2);
                    Iterator<String> iter1 = o1.iterator();
                    Iterator<String> iter2 = o2.iterator();
                    int count = 0;
                    while (count < min) {
                        String s1 = iter1.next();
                        String s2 = iter2.next();
                        int cmp = s1.compareTo(s2);
                        if (cmp != 0)
                            return cmp;
                        count++;
                    }
                    return len1 - len2;
                }
            });

    for (Entry<List<String>, Long> entry : tdu.calculateUsage().entrySet()) {
        TreeSet<String> tableNames = new TreeSet<String>();
        for (String tableId : entry.getKey())
            tableNames.add(reverseTableIdMap.get(tableId));
        usage.put(tableNames, entry.getValue());
    }

    if (!emptyTableIds.isEmpty()) {
        TreeSet<String> emptyTables = new TreeSet<String>();
        for (String tableId : emptyTableIds) {
            emptyTables.add(reverseTableIdMap.get(tableId));
        }
        usage.put(emptyTables, 0L);
    }

    for (Entry<TreeSet<String>, Long> entry : usage.entrySet()) {
        String valueFormat = humanReadable ? "%s" : "%,24d";
        Object value = humanReadable ? humanReadableBytes(entry.getValue()) : entry.getValue();
        printer.print(String.format(valueFormat + " %s", value, entry.getKey()));
    }
}
From source file:org.apache.accumulo.server.test.MultipleIndexIterator2.java
License:Apache License
private static void timeIterate(String dir, int maxFiles, String tmpDir) throws Exception {
    Configuration conf = CachedConfiguration.getInstance();
    FileSystem fs = FileSystem.get(conf);

    // Collect the index file under each subdirectory of dir.
    FileStatus[] files = fs.globStatus(new Path(dir + "/*/index"));
    ArrayList<Path> paths = new ArrayList<Path>(files.length);
    for (FileStatus fileStatus : files) {
        paths.add(fileStatus.getPath());
    }

    long t1 = System.currentTimeMillis();
    ArrayList<Path> rpaths = reduceFiles(conf, fs, paths, maxFiles, tmpDir, 0);
    long t2 = System.currentTimeMillis();

    MultipleIndexIterator2 mii = new MultipleIndexIterator2(conf, fs, rpaths);
    int count = 0;
    while (mii.hasNext()) {
        mii.next();
        count++;
    }
    long t3 = System.currentTimeMillis();

    System.out.printf("reduce time : %6.2f secs \n", (t2 - t1) / 1000.0);
    System.out.printf("iterate time : %6.2f secs \n", (t3 - t2) / 1000.0);
    System.out.printf("total time : %6.2f secs \n", (t3 - t1) / 1000.0);
    System.out.println("count " + count);
}
From source file:org.apache.avro.mapreduce.TestAvroMultipleOutputs.java
License:Apache License
@Test
public void testAvroGenericOutput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(
        getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.txt").toURI().toString()));
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(LineCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(GenericStatsReducer.class);
    AvroJob.setOutputKeySchema(job, STATS_SCHEMA);
    AvroMultipleOutputs.addNamedOutput(job, "myavro", AvroKeyOutputFormat.class, STATS_SCHEMA, null);
    AvroMultipleOutputs.addNamedOutput(job, "myavro1", AvroKeyOutputFormat.class, STATS_SCHEMA_2);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path outputPath = new Path(dir + "/out");
    outputPath.getFileSystem(job.getConfiguration()).delete(outputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));

    // Check that the results from the MapReduce were as expected.
    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/myavro-r-00000.avro"));
    Assert.assertEquals(1, outputFiles.length);
    DataFileReader<GenericData.Record> reader = new DataFileReader<GenericData.Record>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new GenericDatumReader<GenericData.Record>(STATS_SCHEMA));
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (GenericData.Record record : reader) {
        counts.put(((Utf8) record.get("name")).toString(), (Integer) record.get("count"));
    }
    reader.close();
    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());

    outputFiles = fileSystem.globStatus(outputPath.suffix("/myavro1-r-00000.avro"));
    Assert.assertEquals(1, outputFiles.length);
    reader = new DataFileReader<GenericData.Record>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new GenericDatumReader<GenericData.Record>(STATS_SCHEMA_2));
    counts = new HashMap<String, Integer>();
    for (GenericData.Record record : reader) {
        counts.put(((Utf8) record.get("name1")).toString(), (Integer) record.get("count1"));
    }
    reader.close();
    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());

    outputFiles = fileSystem.globStatus(outputPath.suffix("/testnewwrite-r-00000.avro"));
    Assert.assertEquals(1, outputFiles.length);
    reader = new DataFileReader<GenericData.Record>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new GenericDatumReader<GenericData.Record>(STATS_SCHEMA));
    counts = new HashMap<String, Integer>();
    for (GenericData.Record record : reader) {
        counts.put(((Utf8) record.get("name")).toString(), (Integer) record.get("count"));
    }
    reader.close();
    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());

    outputFiles = fileSystem.globStatus(outputPath.suffix("/testnewwrite2-r-00000.avro"));
    Assert.assertEquals(1, outputFiles.length);
    reader = new DataFileReader<GenericData.Record>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new GenericDatumReader<GenericData.Record>(STATS_SCHEMA_2));
    counts = new HashMap<String, Integer>();
    for (GenericData.Record record : reader) {
        counts.put(((Utf8) record.get("name1")).toString(), (Integer) record.get("count1"));
    }
    reader.close();
    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());

    outputFiles = fileSystem.globStatus(outputPath.suffix("/testwritenonschema-r-00000.avro"));
    Assert.assertEquals(1, outputFiles.length);
    reader = new DataFileReader<GenericData.Record>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new GenericDatumReader<GenericData.Record>(STATS_SCHEMA));
    counts = new HashMap<String, Integer>();
    for (GenericData.Record record : reader) {
        counts.put(((Utf8) record.get("name")).toString(), (Integer) record.get("count"));
    }
    reader.close();
    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());
}
From source file:org.apache.avro.mapreduce.TestAvroMultipleOutputs.java
License:Apache License
@Test
public void testAvroSpecificOutput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(
        getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.txt").toURI().toString()));
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(LineCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    AvroMultipleOutputs.addNamedOutput(job, "myavro3", AvroKeyOutputFormat.class, TextStats.SCHEMA$, null);

    job.setReducerClass(SpecificStatsReducer.class);
    AvroJob.setOutputKeySchema(job, TextStats.SCHEMA$);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path outputPath = new Path(dir + "/out-specific");
    outputPath.getFileSystem(job.getConfiguration()).delete(outputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));

    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/myavro3-*"));
    Assert.assertEquals(1, outputFiles.length);
    DataFileReader<TextStats> reader = new DataFileReader<TextStats>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new SpecificDatumReader<TextStats>());
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (TextStats record : reader) {
        counts.put(record.name.toString(), record.count);
    }
    reader.close();
    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());
}
From source file:org.apache.avro.mapreduce.TestAvroMultipleOutputs.java
License:Apache License
@Test
public void testAvroInput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(
        getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.avro").toURI().toString()));
    job.setInputFormatClass(AvroKeyInputFormat.class);
    AvroJob.setInputKeySchema(job, TextStats.SCHEMA$);
    AvroMultipleOutputs.addNamedOutput(job, "myavro3", AvroKeyOutputFormat.class, TextStats.SCHEMA$, null);

    job.setMapperClass(StatCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(SpecificStatsReducer.class);
    AvroJob.setOutputKeySchema(job, TextStats.SCHEMA$);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-specific-input");
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));

    // Check that the results from the MapReduce were as expected.
    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/myavro3-*"));
    Assert.assertEquals(1, outputFiles.length);
    DataFileReader<TextStats> reader = new DataFileReader<TextStats>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new SpecificDatumReader<TextStats>());
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (TextStats record : reader) {
        counts.put(record.name.toString(), record.count);
    }
    reader.close();
    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());
}
From source file:org.apache.avro.mapreduce.TestAvroMultipleOutputs.java
License:Apache License
@Test
public void testAvroMapOutput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(
        getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.avro").toURI().toString()));
    job.setInputFormatClass(AvroKeyInputFormat.class);
    AvroJob.setInputKeySchema(job, TextStats.SCHEMA$);

    job.setMapperClass(SortMapper.class);
    AvroJob.setMapOutputKeySchema(job, TextStats.SCHEMA$);
    job.setMapOutputValueClass(NullWritable.class);

    job.setReducerClass(SortReducer.class);
    AvroJob.setOutputKeySchema(job, TextStats.SCHEMA$);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-specific-input");
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));

    // Check that the results from the MapReduce were as expected.
    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*"));
    Assert.assertEquals(1, outputFiles.length);
    DataFileReader<TextStats> reader = new DataFileReader<TextStats>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new SpecificDatumReader<TextStats>());
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (TextStats record : reader) {
        counts.put(record.name.toString(), record.count);
    }
    reader.close();
    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());
}
From source file:org.apache.avro.mapreduce.TestWordCount.java
License:Apache License
@Test
public void testAvroGenericOutput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(
        getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.txt").toURI().toString()));
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(LineCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(GenericStatsReducer.class);
    AvroJob.setOutputKeySchema(job, STATS_SCHEMA);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-generic");
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));

    // Check that the results from the MapReduce were as expected.
    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*"));
    Assert.assertEquals(1, outputFiles.length);
    DataFileReader<GenericData.Record> reader = new DataFileReader<GenericData.Record>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new GenericDatumReader<GenericData.Record>(STATS_SCHEMA));
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (GenericData.Record record : reader) {
        counts.put(((Utf8) record.get("name")).toString(), (Integer) record.get("count"));
    }
    reader.close();
    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());
}
From source file:org.apache.avro.mapreduce.TestWordCount.java
License:Apache License
@Test
public void testAvroSpecificOutput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(
        getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.txt").toURI().toString()));
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(LineCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(SpecificStatsReducer.class);
    AvroJob.setOutputKeySchema(job, TextStats.SCHEMA$);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-specific");
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));

    // Check that the results from the MapReduce were as expected.
    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*"));
    Assert.assertEquals(1, outputFiles.length);
    DataFileReader<TextStats> reader = new DataFileReader<TextStats>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new SpecificDatumReader<TextStats>());
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (TextStats record : reader) {
        counts.put(record.name.toString(), record.count);
    }
    reader.close();
    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());
}