Example usage for org.apache.hadoop.fs FileSystem globStatus

List of usage examples for org.apache.hadoop.fs FileSystem globStatus

Introduction

This page lists example usages of org.apache.hadoop.fs.FileSystem.globStatus, collected from open-source projects.

Prototype

public FileStatus[] globStatus(Path pathPattern) throws IOException 

Document

Return all the files that match filePattern and are not checksum files.
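Before the project examples below, here is a minimal, self-contained sketch of calling globStatus directly. The class name GlobStatusExample and the glob pattern are made up for illustration only. Note that globStatus can return null instead of an array in some cases (for example, when the path does not exist), which is why the examples on this page check the result before iterating.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Hypothetical glob pattern; adjust to your own directory layout.
        FileStatus[] matches = fs.globStatus(new Path("/data/logs/2020-*/part-*"));

        // globStatus may return null rather than an empty array, so guard before iterating.
        if (matches == null) {
            System.out.println("nothing matched");
            return;
        }
        for (FileStatus status : matches) {
            System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
        }
    }
}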

Usage

From source file:nthu.scopelab.tsqr.ssvd.SSVDSolver.java

License:Apache License

/**
* helper capability to load distributed row matrices into a dense matrix (to
* support tests mainly).
* 
* @param fs
*          filesystem
* @param glob
*          FS glob
* @param conf
*          configuration
* @return Dense matrix array
* @throws IOException
*           when an I/O error occurs.
*/
public static double[][] loadDistributedRowMatrix(FileSystem fs, Path glob, Configuration conf)
        throws IOException {

    FileStatus[] files = fs.globStatus(glob);
    if (files == null) {
        return null;
    }

    List<double[]> denseData = new ArrayList<double[]>();

    /*
     * assume it is partitioned output, so we need to read them up in order of
     * partitions.
     */
    Arrays.sort(files, PARTITION_COMPARATOR);
    SequenceFile.Reader reader = null;
    IntWritable key = null;
    VectorWritable value = null;
    for (FileStatus fstat : files) {
        reader = new SequenceFile.Reader(fs, fstat.getPath(), fs.getConf());
        try {
            key = (IntWritable) reader.getKeyClass().newInstance();
            value = (VectorWritable) reader.getValueClass().newInstance();
        } catch (Exception e) {
            e.printStackTrace();
        }
        while (reader.next(key, value)) {
            Vector v = value.get();
            int size = v.size();
            double[] row = new double[size];
            for (int i = 0; i < size; i++) {
                row[i] = v.get(i);
            }
            // ignore row label.
            denseData.add(row);
        }
        reader.close(); // close each partition's reader, not only the last one
    }
    return denseData.toArray(new double[denseData.size()][]);
}

From source file:nthu.scopelab.tsqr.ssvd.SSVDSolver.java

License:Apache License

/**
 * Load multiple upper triangular matrices and sum them up.
 *
 * @param fs
 * @param glob
 * @param conf
 * @return the sum of upper triangular inputs.
 * @throws IOException
 */
public static cmUpperTriangDenseMatrix loadAndSumUpperTriangMatrices(FileSystem fs, Path glob,
        Configuration conf) throws IOException {

    FileStatus[] files = fs.globStatus(glob);
    if (files == null) {
        return null;
    }
    /*
     * assume it is partitioned output, so we need to read them up in order of
     * partitions.
     */
    Arrays.sort(files, PARTITION_COMPARATOR);
    DenseVector result = null;
    SequenceFile.Reader reader = null;
    IntWritable key = null;
    VectorWritable value = null;
    for (FileStatus fstat : files) {
        reader = new SequenceFile.Reader(fs, fstat.getPath(), fs.getConf());
        try {
            key = (IntWritable) reader.getKeyClass().newInstance();
            value = (VectorWritable) reader.getValueClass().newInstance();
        } catch (Exception e) {
            e.printStackTrace();
        }
        while (reader.next(key, value)) {
            Vector v = value.get();
            if (result == null) {
                result = new DenseVector(v);
            } else {
                result.add(v);
            }
        }
        reader.close(); // close each partition's reader, not only the last one
    }
    if (result == null) {
        throw new IOException("Unexpected underrun in upper triangular matrix files");
    }
    return new cmUpperTriangDenseMatrix(result.getData());
}

From source file:org.apache.accumulo.core.util.TableDiskUsage.java

License:Apache License

public static void printDiskUsage(AccumuloConfiguration acuConf, Collection<String> tables, FileSystem fs,
        Connector conn, Printer printer, boolean humanReadable) throws TableNotFoundException, IOException {

    TableDiskUsage tdu = new TableDiskUsage();

    HashSet<String> tableIds = new HashSet<String>();

    for (String tableName : tables) {
        String tableId = conn.tableOperations().tableIdMap().get(tableName);
        if (tableId == null)
            throw new TableNotFoundException(null, tableName, "Table " + tableName + " not found");

        tableIds.add(tableId);
    }

    for (String tableId : tableIds)
        tdu.addTable(tableId);

    HashSet<String> tablesReferenced = new HashSet<String>(tableIds);
    HashSet<String> emptyTableIds = new HashSet<String>();

    for (String tableId : tableIds) {
        Scanner mdScanner = conn.createScanner(Constants.METADATA_TABLE_NAME, Constants.NO_AUTHS);
        mdScanner.fetchColumnFamily(Constants.METADATA_DATAFILE_COLUMN_FAMILY);
        mdScanner.setRange(new KeyExtent(new Text(tableId), null, null).toMetadataRange());

        if (!mdScanner.iterator().hasNext()) {
            emptyTableIds.add(tableId);
        }

        for (Entry<Key, Value> entry : mdScanner) {
            String file = entry.getKey().getColumnQualifier().toString();
            if (file.startsWith("../")) {
                file = file.substring(2);
                tablesReferenced.add(file.split("\\/")[1]);
            } else
                file = "/" + tableId + file;

            tdu.linkFileAndTable(tableId, file);
        }
    }

    for (String tableId : tablesReferenced) {
        FileStatus[] files = fs.globStatus(new Path(Constants.getTablesDir(acuConf) + "/" + tableId + "/*/*"));

        for (FileStatus fileStatus : files) {
            String dir = fileStatus.getPath().getParent().getName();
            String name = fileStatus.getPath().getName();

            tdu.addFileSize("/" + tableId + "/" + dir + "/" + name, fileStatus.getLen());
        }

    }

    HashMap<String, String> reverseTableIdMap = new HashMap<String, String>();
    for (Entry<String, String> entry : conn.tableOperations().tableIdMap().entrySet())
        reverseTableIdMap.put(entry.getValue(), entry.getKey());

    TreeMap<TreeSet<String>, Long> usage = new TreeMap<TreeSet<String>, Long>(
            new Comparator<TreeSet<String>>() {

                @Override
                public int compare(TreeSet<String> o1, TreeSet<String> o2) {
                    int len1 = o1.size();
                    int len2 = o2.size();

                    int min = Math.min(len1, len2);

                    Iterator<String> iter1 = o1.iterator();
                    Iterator<String> iter2 = o2.iterator();

                    int count = 0;

                    while (count < min) {
                        String s1 = iter1.next();
                        String s2 = iter2.next();

                        int cmp = s1.compareTo(s2);

                        if (cmp != 0)
                            return cmp;

                        count++;
                    }

                    return len1 - len2;
                }
            });

    for (Entry<List<String>, Long> entry : tdu.calculateUsage().entrySet()) {
        TreeSet<String> tableNames = new TreeSet<String>();
        for (String tableId : entry.getKey())
            tableNames.add(reverseTableIdMap.get(tableId));

        usage.put(tableNames, entry.getValue());
    }

    if (!emptyTableIds.isEmpty()) {
        TreeSet<String> emptyTables = new TreeSet<String>();
        for (String tableId : emptyTableIds) {
            emptyTables.add(reverseTableIdMap.get(tableId));
        }
        usage.put(emptyTables, 0L);
    }

    for (Entry<TreeSet<String>, Long> entry : usage.entrySet()) {
        String valueFormat = humanReadable ? "%s" : "%,24d";
        Object value = humanReadable ? humanReadableBytes(entry.getValue()) : entry.getValue();
        printer.print(String.format(valueFormat + " %s", value, entry.getKey()));
    }
}

From source file:org.apache.accumulo.server.test.MultipleIndexIterator2.java

License:Apache License

private static void timeIterate(String dir, int maxFiles, String tmpDir) throws Exception {
    Configuration conf = CachedConfiguration.getInstance();
    FileSystem fs = FileSystem.get(conf);

    FileStatus[] files = fs.globStatus(new Path(dir + "/*/index"));
    ArrayList<Path> paths = new ArrayList<Path>(files.length);

    for (FileStatus fileStatus : files) {
        paths.add(fileStatus.getPath());
    }

    long t1 = System.currentTimeMillis();
    ArrayList<Path> rpaths = reduceFiles(conf, fs, paths, maxFiles, tmpDir, 0);
    long t2 = System.currentTimeMillis();

    MultipleIndexIterator2 mii = new MultipleIndexIterator2(conf, fs, rpaths);

    int count = 0;
    while (mii.hasNext()) {
        mii.next();
        count++;
    }

    long t3 = System.currentTimeMillis();

    System.out.printf("reduce time  : %6.2f secs \n", (t2 - t1) / 1000.0);
    System.out.printf("iterate time : %6.2f secs \n", (t3 - t2) / 1000.0);
    System.out.printf("total time   : %6.2f secs \n", (t3 - t1) / 1000.0);

    System.out.println("count " + count);
}

From source file:org.apache.avro.mapreduce.TestAvroMultipleOutputs.java

License:Apache License

@Test
public void testAvroGenericOutput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(
            getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.txt").toURI().toString()));
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(LineCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(GenericStatsReducer.class);
    AvroJob.setOutputKeySchema(job, STATS_SCHEMA);
    AvroMultipleOutputs.addNamedOutput(job, "myavro", AvroKeyOutputFormat.class, STATS_SCHEMA, null);
    AvroMultipleOutputs.addNamedOutput(job, "myavro1", AvroKeyOutputFormat.class, STATS_SCHEMA_2);
    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path outputPath = new Path(dir + "/out");
    outputPath.getFileSystem(job.getConfiguration()).delete(outputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));

    // Check that the results from the MapReduce were as expected.
    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/myavro-r-00000.avro"));
    Assert.assertEquals(1, outputFiles.length);
    DataFileReader<GenericData.Record> reader = new DataFileReader<GenericData.Record>(
            new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
            new GenericDatumReader<GenericData.Record>(STATS_SCHEMA));
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (GenericData.Record record : reader) {
        counts.put(((Utf8) record.get("name")).toString(), (Integer) record.get("count"));
    }
    reader.close();

    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());

    outputFiles = fileSystem.globStatus(outputPath.suffix("/myavro1-r-00000.avro"));
    Assert.assertEquals(1, outputFiles.length);
    reader = new DataFileReader<GenericData.Record>(
            new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
            new GenericDatumReader<GenericData.Record>(STATS_SCHEMA_2));
    counts = new HashMap<String, Integer>();
    for (GenericData.Record record : reader) {
        counts.put(((Utf8) record.get("name1")).toString(), (Integer) record.get("count1"));
    }
    reader.close();

    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());

    outputFiles = fileSystem.globStatus(outputPath.suffix("/testnewwrite-r-00000.avro"));
    Assert.assertEquals(1, outputFiles.length);
    reader = new DataFileReader<GenericData.Record>(
            new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
            new GenericDatumReader<GenericData.Record>(STATS_SCHEMA));
    counts = new HashMap<String, Integer>();
    for (GenericData.Record record : reader) {
        counts.put(((Utf8) record.get("name")).toString(), (Integer) record.get("count"));
    }
    reader.close();

    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());

    outputFiles = fileSystem.globStatus(outputPath.suffix("/testnewwrite2-r-00000.avro"));
    Assert.assertEquals(1, outputFiles.length);
    reader = new DataFileReader<GenericData.Record>(
            new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
            new GenericDatumReader<GenericData.Record>(STATS_SCHEMA_2));
    counts = new HashMap<String, Integer>();
    for (GenericData.Record record : reader) {
        counts.put(((Utf8) record.get("name1")).toString(), (Integer) record.get("count1"));
    }
    reader.close();
    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());

    outputFiles = fileSystem.globStatus(outputPath.suffix("/testwritenonschema-r-00000.avro"));
    Assert.assertEquals(1, outputFiles.length);
    reader = new DataFileReader<GenericData.Record>(
            new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
            new GenericDatumReader<GenericData.Record>(STATS_SCHEMA));
    counts = new HashMap<String, Integer>();
    for (GenericData.Record record : reader) {
        counts.put(((Utf8) record.get("name")).toString(), (Integer) record.get("count"));
    }
    reader.close();

    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());

}

From source file:org.apache.avro.mapreduce.TestAvroMultipleOutputs.java

License:Apache License

@Test
public void testAvroSpecificOutput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(
            getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.txt").toURI().toString()));
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(LineCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    AvroMultipleOutputs.addNamedOutput(job, "myavro3", AvroKeyOutputFormat.class, TextStats.SCHEMA$, null);

    job.setReducerClass(SpecificStatsReducer.class);
    AvroJob.setOutputKeySchema(job, TextStats.SCHEMA$);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path outputPath = new Path(dir + "/out-specific");
    outputPath.getFileSystem(job.getConfiguration()).delete(outputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));
    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/myavro3-*"));
    Assert.assertEquals(1, outputFiles.length);
    DataFileReader<TextStats> reader = new DataFileReader<TextStats>(
            new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
            new SpecificDatumReader<TextStats>());
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (TextStats record : reader) {
        counts.put(record.name.toString(), record.count);
    }
    reader.close();

    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());
}

From source file:org.apache.avro.mapreduce.TestAvroMultipleOutputs.java

License:Apache License

@Test
public void testAvroInput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(
            getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.avro").toURI().toString()));
    job.setInputFormatClass(AvroKeyInputFormat.class);
    AvroJob.setInputKeySchema(job, TextStats.SCHEMA$);
    AvroMultipleOutputs.addNamedOutput(job, "myavro3", AvroKeyOutputFormat.class, TextStats.SCHEMA$, null);

    job.setMapperClass(StatCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(SpecificStatsReducer.class);
    AvroJob.setOutputKeySchema(job, TextStats.SCHEMA$);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-specific-input");
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));

    // Check that the results from the MapReduce were as expected.
    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/myavro3-*"));
    Assert.assertEquals(1, outputFiles.length);
    DataFileReader<TextStats> reader = new DataFileReader<TextStats>(
            new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
            new SpecificDatumReader<TextStats>());
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (TextStats record : reader) {
        counts.put(record.name.toString(), record.count);
    }
    reader.close();

    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());
}

From source file:org.apache.avro.mapreduce.TestAvroMultipleOutputs.java

License:Apache License

@Test
public void testAvroMapOutput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(
            getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.avro").toURI().toString()));
    job.setInputFormatClass(AvroKeyInputFormat.class);
    AvroJob.setInputKeySchema(job, TextStats.SCHEMA$);

    job.setMapperClass(SortMapper.class);
    AvroJob.setMapOutputKeySchema(job, TextStats.SCHEMA$);
    job.setMapOutputValueClass(NullWritable.class);

    job.setReducerClass(SortReducer.class);
    AvroJob.setOutputKeySchema(job, TextStats.SCHEMA$);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-specific-input");
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));

    // Check that the results from the MapReduce were as expected.
    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*"));
    Assert.assertEquals(1, outputFiles.length);
    DataFileReader<TextStats> reader = new DataFileReader<TextStats>(
            new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
            new SpecificDatumReader<TextStats>());
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (TextStats record : reader) {
        counts.put(record.name.toString(), record.count);
    }
    reader.close();

    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());
}

From source file:org.apache.avro.mapreduce.TestWordCount.java

License:Apache License

@Test
public void testAvroGenericOutput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(
            getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.txt").toURI().toString()));
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(LineCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(GenericStatsReducer.class);
    AvroJob.setOutputKeySchema(job, STATS_SCHEMA);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-generic");
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));

    // Check that the results from the MapReduce were as expected.
    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*"));
    Assert.assertEquals(1, outputFiles.length);
    DataFileReader<GenericData.Record> reader = new DataFileReader<GenericData.Record>(
            new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
            new GenericDatumReader<GenericData.Record>(STATS_SCHEMA));
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (GenericData.Record record : reader) {
        counts.put(((Utf8) record.get("name")).toString(), (Integer) record.get("count"));
    }
    reader.close();

    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());
}

From source file:org.apache.avro.mapreduce.TestWordCount.java

License:Apache License

@Test
public void testAvroSpecificOutput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(
            getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.txt").toURI().toString()));
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(LineCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(SpecificStatsReducer.class);
    AvroJob.setOutputKeySchema(job, TextStats.SCHEMA$);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-specific");
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));

    // Check that the results from the MapReduce were as expected.
    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*"));
    Assert.assertEquals(1, outputFiles.length);
    DataFileReader<TextStats> reader = new DataFileReader<TextStats>(
            new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
            new SpecificDatumReader<TextStats>());
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (TextStats record : reader) {
        counts.put(record.name.toString(), record.count);
    }
    reader.close();

    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());
}