Usage examples for org.apache.hadoop.fs.FileSystem.globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Return all files that match pathPattern and are not checksum files. The call may return null when nothing matches, so callers should null-check the result (as several of the examples below do).
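Before the full examples, here is a minimal sketch of the call pattern. The directory and glob (/data/out/part-*) are illustrative placeholders, not taken from any example below; the null check matters because globStatus can report "no match" that way.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Match every part file under a hypothetical job output directory.
        FileStatus[] matches = fs.globStatus(new Path("/data/out/part-*"));
        if (matches == null) {
            System.out.println("nothing matched the pattern");
            return;
        }
        for (FileStatus status : matches) {
            System.out.println(status.getPath() + "\t" + status.getLen() + " bytes");
        }
    }
}

Each FileStatus carries the matched path plus its length and other metadata, so the examples below typically glob once and then iterate the array.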
From source file:nthu.scopelab.tsqr.ssvd.SSVDSolver.java
License:Apache License
/**
 * Helper capability to load distributed row matrices into a dense matrix
 * (mainly to support tests).
 *
 * @param fs filesystem
 * @param glob FS glob
 * @param conf configuration
 * @return dense matrix array
 * @throws IOException when I/O occurs.
 */
public static double[][] loadDistributedRowMatrix(FileSystem fs, Path glob, Configuration conf)
        throws IOException {
    FileStatus[] files = fs.globStatus(glob);
    if (files == null) {
        return null;
    }

    List<double[]> denseData = new ArrayList<double[]>();

    /*
     * Assume it is partitioned output, so we need to read the files in order
     * of partitions.
     */
    Arrays.sort(files, PARTITION_COMPARATOR);

    IntWritable key = null;
    VectorWritable value = null;
    for (FileStatus fstat : files) {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, fstat.getPath(), fs.getConf());
        try {
            key = (IntWritable) reader.getKeyClass().newInstance();
            value = (VectorWritable) reader.getValueClass().newInstance();
        } catch (Exception e) {
            e.printStackTrace();
        }
        while (reader.next(key, value)) {
            Vector v = value.get();
            int size = v.size();
            double[] row = new double[size];
            for (int i = 0; i < size; i++) {
                row[i] = v.get(i);
            }
            // Ignore row label.
            denseData.add(row);
        }
        // Close each reader before opening the next to avoid leaking file handles.
        reader.close();
    }

    return denseData.toArray(new double[denseData.size()][]);
}
From source file:nthu.scopelab.tsqr.ssvd.SSVDSolver.java
License:Apache License
/**
 * Load multiple upper triangular matrices and sum them up.
 *
 * @param fs filesystem
 * @param glob FS glob
 * @param conf configuration
 * @return the sum of the upper triangular inputs.
 * @throws IOException when I/O occurs.
 */
public static cmUpperTriangDenseMatrix loadAndSumUpperTriangMatrices(FileSystem fs, Path glob,
        Configuration conf) throws IOException {
    FileStatus[] files = fs.globStatus(glob);
    if (files == null) {
        return null;
    }

    /*
     * Assume it is partitioned output, so we need to read the files in order
     * of partitions.
     */
    Arrays.sort(files, PARTITION_COMPARATOR);

    DenseVector result = null;
    IntWritable key = null;
    VectorWritable value = null;
    for (FileStatus fstat : files) {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, fstat.getPath(), fs.getConf());
        try {
            key = (IntWritable) reader.getKeyClass().newInstance();
            value = (VectorWritable) reader.getValueClass().newInstance();
        } catch (Exception e) {
            e.printStackTrace();
        }
        while (reader.next(key, value)) {
            Vector v = value.get();
            if (result == null) {
                result = new DenseVector(v);
            } else {
                result.add(v);
            }
        }
        // Close each reader before opening the next to avoid leaking file handles.
        reader.close();
    }

    if (result == null) {
        throw new IOException("Unexpected underrun in upper triangular matrix files");
    }
    return new cmUpperTriangDenseMatrix(result.getData());
}
From source file:org.apache.accumulo.core.util.TableDiskUsage.java
License:Apache License
public static void printDiskUsage(AccumuloConfiguration acuConf, Collection<String> tables, FileSystem fs,
        Connector conn, Printer printer, boolean humanReadable) throws TableNotFoundException, IOException {

    TableDiskUsage tdu = new TableDiskUsage();

    // Resolve table names to table IDs.
    HashSet<String> tableIds = new HashSet<String>();
    for (String tableName : tables) {
        String tableId = conn.tableOperations().tableIdMap().get(tableName);
        if (tableId == null)
            throw new TableNotFoundException(null, tableName, "Table " + tableName + " not found");
        tableIds.add(tableId);
    }

    for (String tableId : tableIds)
        tdu.addTable(tableId);

    HashSet<String> tablesReferenced = new HashSet<String>(tableIds);
    HashSet<String> emptyTableIds = new HashSet<String>();

    // Scan the metadata table to find the data files referenced by each table.
    for (String tableId : tableIds) {
        Scanner mdScanner = conn.createScanner(Constants.METADATA_TABLE_NAME, Constants.NO_AUTHS);
        mdScanner.fetchColumnFamily(Constants.METADATA_DATAFILE_COLUMN_FAMILY);
        mdScanner.setRange(new KeyExtent(new Text(tableId), null, null).toMetadataRange());

        if (!mdScanner.iterator().hasNext()) {
            emptyTableIds.add(tableId);
        }

        for (Entry<Key, Value> entry : mdScanner) {
            String file = entry.getKey().getColumnQualifier().toString();
            if (file.startsWith("../")) {
                file = file.substring(2);
                tablesReferenced.add(file.split("\\/")[1]);
            } else
                file = "/" + tableId + file;
            tdu.linkFileAndTable(tableId, file);
        }
    }

    // Glob each referenced table's directories to get the size of every file.
    for (String tableId : tablesReferenced) {
        FileStatus[] files = fs.globStatus(new Path(Constants.getTablesDir(acuConf) + "/" + tableId + "/*/*"));
        for (FileStatus fileStatus : files) {
            String dir = fileStatus.getPath().getParent().getName();
            String name = fileStatus.getPath().getName();
            tdu.addFileSize("/" + tableId + "/" + dir + "/" + name, fileStatus.getLen());
        }
    }

    HashMap<String, String> reverseTableIdMap = new HashMap<String, String>();
    for (Entry<String, String> entry : conn.tableOperations().tableIdMap().entrySet())
        reverseTableIdMap.put(entry.getValue(), entry.getKey());

    // Order usage entries by the set of table names that share each group of files.
    TreeMap<TreeSet<String>, Long> usage = new TreeMap<TreeSet<String>, Long>(
            new Comparator<TreeSet<String>>() {
                @Override
                public int compare(TreeSet<String> o1, TreeSet<String> o2) {
                    int len1 = o1.size();
                    int len2 = o2.size();
                    int min = Math.min(len1, len2);
                    Iterator<String> iter1 = o1.iterator();
                    Iterator<String> iter2 = o2.iterator();
                    int count = 0;
                    while (count < min) {
                        String s1 = iter1.next();
                        String s2 = iter2.next();
                        int cmp = s1.compareTo(s2);
                        if (cmp != 0)
                            return cmp;
                        count++;
                    }
                    return len1 - len2;
                }
            });

    for (Entry<List<String>, Long> entry : tdu.calculateUsage().entrySet()) {
        TreeSet<String> tableNames = new TreeSet<String>();
        for (String tableId : entry.getKey())
            tableNames.add(reverseTableIdMap.get(tableId));
        usage.put(tableNames, entry.getValue());
    }

    if (!emptyTableIds.isEmpty()) {
        TreeSet<String> emptyTables = new TreeSet<String>();
        for (String tableId : emptyTableIds) {
            emptyTables.add(reverseTableIdMap.get(tableId));
        }
        usage.put(emptyTables, 0L);
    }

    for (Entry<TreeSet<String>, Long> entry : usage.entrySet()) {
        String valueFormat = humanReadable ? "%s" : "%,24d";
        Object value = humanReadable ? humanReadableBytes(entry.getValue()) : entry.getValue();
        printer.print(String.format(valueFormat + " %s", value, entry.getKey()));
    }
}
From source file:org.apache.accumulo.server.test.MultipleIndexIterator2.java
License:Apache License
private static void timeIterate(String dir, int maxFiles, String tmpDir) throws Exception {
    Configuration conf = CachedConfiguration.getInstance();
    FileSystem fs = FileSystem.get(conf);

    // Collect the index file under each subdirectory of dir.
    FileStatus[] files = fs.globStatus(new Path(dir + "/*/index"));
    ArrayList<Path> paths = new ArrayList<Path>(files.length);
    for (FileStatus fileStatus : files) {
        paths.add(fileStatus.getPath());
    }

    long t1 = System.currentTimeMillis();
    ArrayList<Path> rpaths = reduceFiles(conf, fs, paths, maxFiles, tmpDir, 0);
    long t2 = System.currentTimeMillis();

    MultipleIndexIterator2 mii = new MultipleIndexIterator2(conf, fs, rpaths);
    int count = 0;
    while (mii.hasNext()) {
        mii.next();
        count++;
    }
    long t3 = System.currentTimeMillis();

    System.out.printf("reduce time : %6.2f secs \n", (t2 - t1) / 1000.0);
    System.out.printf("iterate time : %6.2f secs \n", (t3 - t2) / 1000.0);
    System.out.printf("total time : %6.2f secs \n", (t3 - t1) / 1000.0);
    System.out.println("count " + count);
}
From source file:org.apache.avro.mapreduce.TestAvroMultipleOutputs.java
License:Apache License
@Test
public void testAvroGenericOutput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(
        getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.txt").toURI().toString()));
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(LineCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(GenericStatsReducer.class);
    AvroJob.setOutputKeySchema(job, STATS_SCHEMA);
    AvroMultipleOutputs.addNamedOutput(job, "myavro", AvroKeyOutputFormat.class, STATS_SCHEMA, null);
    AvroMultipleOutputs.addNamedOutput(job, "myavro1", AvroKeyOutputFormat.class, STATS_SCHEMA_2);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path outputPath = new Path(dir + "/out");
    outputPath.getFileSystem(job.getConfiguration()).delete(outputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));

    // Check that the results from the MapReduce were as expected.
    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/myavro-r-00000.avro"));
    Assert.assertEquals(1, outputFiles.length);
    DataFileReader<GenericData.Record> reader = new DataFileReader<GenericData.Record>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new GenericDatumReader<GenericData.Record>(STATS_SCHEMA));
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (GenericData.Record record : reader) {
        counts.put(((Utf8) record.get("name")).toString(), (Integer) record.get("count"));
    }
    reader.close();
    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());

    outputFiles = fileSystem.globStatus(outputPath.suffix("/myavro1-r-00000.avro"));
    Assert.assertEquals(1, outputFiles.length);
    reader = new DataFileReader<GenericData.Record>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new GenericDatumReader<GenericData.Record>(STATS_SCHEMA_2));
    counts = new HashMap<String, Integer>();
    for (GenericData.Record record : reader) {
        counts.put(((Utf8) record.get("name1")).toString(), (Integer) record.get("count1"));
    }
    reader.close();
    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());

    outputFiles = fileSystem.globStatus(outputPath.suffix("/testnewwrite-r-00000.avro"));
    Assert.assertEquals(1, outputFiles.length);
    reader = new DataFileReader<GenericData.Record>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new GenericDatumReader<GenericData.Record>(STATS_SCHEMA));
    counts = new HashMap<String, Integer>();
    for (GenericData.Record record : reader) {
        counts.put(((Utf8) record.get("name")).toString(), (Integer) record.get("count"));
    }
    reader.close();
    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());

    outputFiles = fileSystem.globStatus(outputPath.suffix("/testnewwrite2-r-00000.avro"));
    Assert.assertEquals(1, outputFiles.length);
    reader = new DataFileReader<GenericData.Record>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new GenericDatumReader<GenericData.Record>(STATS_SCHEMA_2));
    counts = new HashMap<String, Integer>();
    for (GenericData.Record record : reader) {
        counts.put(((Utf8) record.get("name1")).toString(), (Integer) record.get("count1"));
    }
    reader.close();
    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());

    outputFiles = fileSystem.globStatus(outputPath.suffix("/testwritenonschema-r-00000.avro"));
    Assert.assertEquals(1, outputFiles.length);
    reader = new DataFileReader<GenericData.Record>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new GenericDatumReader<GenericData.Record>(STATS_SCHEMA));
    counts = new HashMap<String, Integer>();
    for (GenericData.Record record : reader) {
        counts.put(((Utf8) record.get("name")).toString(), (Integer) record.get("count"));
    }
    reader.close();
    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());
}
From source file:org.apache.avro.mapreduce.TestAvroMultipleOutputs.java
License:Apache License
@Test
public void testAvroSpecificOutput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(
        getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.txt").toURI().toString()));
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(LineCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    AvroMultipleOutputs.addNamedOutput(job, "myavro3", AvroKeyOutputFormat.class, TextStats.SCHEMA$, null);

    job.setReducerClass(SpecificStatsReducer.class);
    AvroJob.setOutputKeySchema(job, TextStats.SCHEMA$);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path outputPath = new Path(dir + "/out-specific");
    outputPath.getFileSystem(job.getConfiguration()).delete(outputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));

    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/myavro3-*"));
    Assert.assertEquals(1, outputFiles.length);
    DataFileReader<TextStats> reader = new DataFileReader<TextStats>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new SpecificDatumReader<TextStats>());
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (TextStats record : reader) {
        counts.put(record.name.toString(), record.count);
    }
    reader.close();
    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());
}
From source file:org.apache.avro.mapreduce.TestAvroMultipleOutputs.java
License:Apache License
@Test
public void testAvroInput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(
        getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.avro").toURI().toString()));
    job.setInputFormatClass(AvroKeyInputFormat.class);
    AvroJob.setInputKeySchema(job, TextStats.SCHEMA$);
    AvroMultipleOutputs.addNamedOutput(job, "myavro3", AvroKeyOutputFormat.class, TextStats.SCHEMA$, null);

    job.setMapperClass(StatCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(SpecificStatsReducer.class);
    AvroJob.setOutputKeySchema(job, TextStats.SCHEMA$);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-specific-input");
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));

    // Check that the results from the MapReduce were as expected.
    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/myavro3-*"));
    Assert.assertEquals(1, outputFiles.length);
    DataFileReader<TextStats> reader = new DataFileReader<TextStats>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new SpecificDatumReader<TextStats>());
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (TextStats record : reader) {
        counts.put(record.name.toString(), record.count);
    }
    reader.close();
    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());
}
From source file:org.apache.avro.mapreduce.TestAvroMultipleOutputs.java
License:Apache License
@Test
public void testAvroMapOutput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(
        getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.avro").toURI().toString()));
    job.setInputFormatClass(AvroKeyInputFormat.class);
    AvroJob.setInputKeySchema(job, TextStats.SCHEMA$);

    job.setMapperClass(SortMapper.class);
    AvroJob.setMapOutputKeySchema(job, TextStats.SCHEMA$);
    job.setMapOutputValueClass(NullWritable.class);

    job.setReducerClass(SortReducer.class);
    AvroJob.setOutputKeySchema(job, TextStats.SCHEMA$);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-specific-input");
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));

    // Check that the results from the MapReduce were as expected.
    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*"));
    Assert.assertEquals(1, outputFiles.length);
    DataFileReader<TextStats> reader = new DataFileReader<TextStats>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new SpecificDatumReader<TextStats>());
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (TextStats record : reader) {
        counts.put(record.name.toString(), record.count);
    }
    reader.close();
    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());
}
From source file:org.apache.avro.mapreduce.TestWordCount.java
License:Apache License
@Test
public void testAvroGenericOutput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(
        getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.txt").toURI().toString()));
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(LineCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(GenericStatsReducer.class);
    AvroJob.setOutputKeySchema(job, STATS_SCHEMA);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-generic");
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));

    // Check that the results from the MapReduce were as expected.
    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*"));
    Assert.assertEquals(1, outputFiles.length);
    DataFileReader<GenericData.Record> reader = new DataFileReader<GenericData.Record>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new GenericDatumReader<GenericData.Record>(STATS_SCHEMA));
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (GenericData.Record record : reader) {
        counts.put(((Utf8) record.get("name")).toString(), (Integer) record.get("count"));
    }
    reader.close();
    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());
}
From source file:org.apache.avro.mapreduce.TestWordCount.java
License:Apache License
@Test
public void testAvroSpecificOutput() throws Exception {
    Job job = new Job();

    FileInputFormat.setInputPaths(job, new Path(
        getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.txt").toURI().toString()));
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(LineCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(SpecificStatsReducer.class);
    AvroJob.setOutputKeySchema(job, TextStats.SCHEMA$);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-specific");
    FileOutputFormat.setOutputPath(job, outputPath);

    Assert.assertTrue(job.waitForCompletion(true));

    // Check that the results from the MapReduce were as expected.
    FileSystem fileSystem = FileSystem.get(job.getConfiguration());
    FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*"));
    Assert.assertEquals(1, outputFiles.length);
    DataFileReader<TextStats> reader = new DataFileReader<TextStats>(
        new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
        new SpecificDatumReader<TextStats>());
    Map<String, Integer> counts = new HashMap<String, Integer>();
    for (TextStats record : reader) {
        counts.put(record.name.toString(), record.count);
    }
    reader.close();
    Assert.assertEquals(3, counts.get("apple").intValue());
    Assert.assertEquals(2, counts.get("banana").intValue());
    Assert.assertEquals(1, counts.get("carrot").intValue());
}