List of usage examples for org.apache.hadoop.fs FileSystem listStatus
public FileStatus[] listStatus(Path f) throws FileNotFoundException, IOException
public FileStatus[] listStatus(Path[] files) throws FileNotFoundException, IOException
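A minimal sketch of the call pattern shared by the examples below: get a FileSystem from a Configuration, call listStatus on a directory, and iterate over the returned FileStatus entries. The path "/user/data" is a placeholder for illustration only, not taken from any of the source files.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ListStatusExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // listStatus returns one FileStatus per entry directly under the given path
        for (FileStatus status : fs.listStatus(new Path("/user/data"))) {
            System.out.println(status.getPath() + (status.isDirectory() ? " (dir)" : ""));
        }
    }
}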
From source file:FlinkBootstrap.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        throw new IllegalArgumentException(
                "Provide `TaskManager` or `JobManager` parameter with config folder");
    }

    // Load Hadoop S3 wrapper classes, due to ClassNotFoundException without
    Class.forName("org.apache.flink.runtime.fs.hdfs.HadoopFileSystem");
    Class.forName("org.apache.hadoop.fs.s3a.S3AFileSystem");

    // Verify s3 is accessible
    Configuration conf = new Configuration();
    conf.addResource(new Path("config/hadoop/core-site.xml"));
    conf.addResource(new Path("config/hadoop/hdfs-site.xml"));
    FileSystem fs = FileSystem.get(conf);
    fs.listStatus(new Path("s3://dir"));

    if (args[0].equals("TaskManager")) {
        TaskManager.main(new String[] { "--configDir", args[1], });
    } else if (args[0].equals("JobManager")) {
        JobManager.main(new String[] { "--configDir", args[1], "--executionMode", "cluster", });
    } else {
        throw new IllegalArgumentException("Unknown parameter `" + args[0] + "`");
    }
}
From source file:FDFGenData.java
License:Open Source License
public static void testwritefile(String tabledir, int num) throws Exception {
    String rawtmp = "/tmp/raw/rawfile";
    FileSystem fs = FileSystem.get(new Configuration());
    FileStatus[] fss = fs.listStatus(new Path(tabledir));
    int x = 0;
    if (fss != null) {
        x = fss.length;
    }

    PT.testgenrawfiler(rawtmp, num);
    PT.testwritefdf(tabledir + "file" + (x + 1), rawtmp, false, (short) -1);
    PT.testgenrawfiler(rawtmp, num);
    PT.testwritefdf(tabledir + "file" + (x + 2), rawtmp, false, (short) -1);
    PT.testgenrawfiler(rawtmp, num);
    PT.testwritefdf(tabledir + "file" + (x + 3), rawtmp, false, (short) -1);
    PT.testgenrawfiler(rawtmp, num);
    PT.testwritefdf(tabledir + "file" + (x + 4), rawtmp, false, (short) -1);
    PT.testgenrawfiler(rawtmp, num);
    PT.testwritefdf(tabledir + "file" + (x + 5), rawtmp, false, (short) -1);
}
From source file:HBaseBloomFilterSemiJoinSystemTest.java
License:Apache License
private static void listFiles(FileSystem fs, Path path) throws IOException {
    for (FileStatus status : fs.listStatus(path)) {
        LOG.info(status.getPath().toString());
        if (status.isDir()) {
            listFiles(fs, status.getPath());
        }
    }
}
From source file:RunPageRankSchimmy.java
License:Apache License
private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner,
        boolean useRange) throws Exception {
    Configuration conf = getConf();

    String in = path + "/iter" + FORMAT.format(i);
    String out = path + "/iter" + FORMAT.format(j) + "t";
    String outm = out + "-mass";

    FileSystem fs = FileSystem.get(conf);

    // We need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(conf).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-")) {
            numPartitions++;
        }
    }

    conf.setInt("NodeCount", n);

    Partitioner<IntWritable, Writable> p = null;
    if (useRange) {
        p = new RangePartitioner();
        ((Configurable) p).setConf(conf);
    } else {
        p = new HashPartitioner<IntWritable, Writable>();
    }

    // This is really annoying: the mapping between the partition numbers on
    // disk (i.e., part-XXXX) and what partition the file contains (i.e.,
    // key.hash % #reducer) is arbitrary... so this means that we need to
    // open up each partition, peek inside to find out.
    IntWritable key = new IntWritable();
    PageRankNode value = new PageRankNode();
    FileStatus[] status = fs.listStatus(new Path(in));

    StringBuilder sb = new StringBuilder();
    for (FileStatus f : status) {
        if (!f.getPath().getName().contains("part-")) {
            continue;
        }

        SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(f.getPath()));
        reader.next(key, value);
        int np = p.getPartition(key, value, numPartitions);
        reader.close();

        LOG.info(f.getPath() + "\t" + np);
        sb.append(np + "=" + f.getPath() + ";");
    }

    LOG.info(sb.toString().trim());

    LOG.info("PageRankSchimmy: iteration " + j + ": Phase1");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);
    LOG.info(" - nodeCnt: " + n);
    LOG.info(" - useCombiner: " + useCombiner);
    LOG.info(" - useInmapCombiner: " + useInmapCombiner);
    LOG.info(" - numPartitions: " + numPartitions);
    LOG.info(" - useRange: " + useRange);
    LOG.info("computed number of partitions: " + numPartitions);

    int numReduceTasks = numPartitions;

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    //conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.set("PageRankMassPath", outm);
    conf.set("BasePath", in);
    conf.set("PartitionMapping", sb.toString().trim());

    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    Job job = Job.getInstance(conf);
    job.setJobName("PageRankSchimmy:iteration" + j + ":Phase1");
    job.setJarByClass(RunPageRankSchimmy.class);

    job.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(FloatWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    if (useInmapCombiner) {
        job.setMapperClass(MapWithInMapperCombiningClass.class);
    } else {
        job.setMapperClass(MapClass.class);
    }

    if (useCombiner) {
        job.setCombinerClass(CombineClass.class);
    }

    if (useRange) {
        job.setPartitionerClass(RangePartitioner.class);
    }

    job.setReducerClass(ReduceClass.class);

    FileSystem.get(conf).delete(new Path(out), true);
    FileSystem.get(conf).delete(new Path(outm), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    float mass = Float.NEGATIVE_INFINITY;
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        fin.close();
    }

    return mass;
}
From source file:RunPageRankBasic.java
License:Apache License
private float phase1(int i, int j, String basePath, int numNodes, boolean useCombiner,
        boolean useInMapperCombiner) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName("PageRank:Basic:iteration" + j + ":Phase1");
    job.setJarByClass(RunPageRankBasic.class);

    String in = basePath + "/iter" + formatter.format(i);
    String out = basePath + "/iter" + formatter.format(j) + "t";
    String outm = out + "-mass";

    // We need to actually count the number of part files to get the number of partitions
    // (because the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(getConf()).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-"))
            numPartitions++;
    }

    LOG.info("PageRank: iteration " + j + ": Phase1");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);
    LOG.info(" - nodeCnt: " + numNodes);
    LOG.info(" - useCombiner: " + useCombiner);
    LOG.info(" - useInmapCombiner: " + useInMapperCombiner);
    LOG.info("computed number of partitions: " + numPartitions);

    int numReduceTasks = numPartitions;

    job.getConfiguration().setInt("NodeCount", numNodes);
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    //job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");
    job.getConfiguration().set("PageRankMassPath", outm);

    job.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(NonSplitableSequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(useInMapperCombiner ? MapWithInMapperCombiningClass.class : MapClass.class);

    if (useCombiner) {
        job.setCombinerClass(CombineClass.class);
    }

    job.setReducerClass(ReduceClass.class);

    FileSystem.get(getConf()).delete(new Path(out), true);
    FileSystem.get(getConf()).delete(new Path(outm), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    float mass = Float.NEGATIVE_INFINITY;
    FileSystem fs = FileSystem.get(getConf());
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        fin.close();
    }

    return mass;
}
From source file:HiveKeyIgnoringBAMOutputFormat.java
License:Open Source License
private void setSAMHeaderFrom(JobConf job) throws IOException {
    if (wrappedOutputFormat.getSAMHeader() != null)
        return;

    // XXX: We're not told where to take the SAM header from so we just merge
    // them all. There should probably be a better way of doing this.
    final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>();

    // The "best" sort order among the headers: unsorted if they're sorted
    // differently, otherwise their common sort order.
    SAMFileHeader.SortOrder sortOrder = null;

    // XXX: it seems that FileInputFormat.getInputPaths(job) will point to
    // the directories of the input tables in the query. I'm not sure if this
    // is always the case.
    for (final Path table : FileInputFormat.getInputPaths(job)) {
        final FileSystem fs = table.getFileSystem(job);
        for (final FileStatus stat : fs.listStatus(table)) {
            if (!stat.isFile())
                throw new IOException("Unexpected directory '" + stat.getPath() + "', expected only files");

            final SAMFileReader r = new SAMFileReader(fs.open(stat.getPath()));
            final SAMFileHeader h = r.getFileHeader();
            r.close();
            headers.add(h);

            if (sortOrder == null) {
                sortOrder = h.getSortOrder();
                continue;
            }
            if (sortOrder == SAMFileHeader.SortOrder.unsorted)
                continue;
            if (sortOrder != h.getSortOrder())
                sortOrder = SAMFileHeader.SortOrder.unsorted;
        }
    }

    wrappedOutputFormat.setSAMHeader(new SamFileHeaderMerger(sortOrder, headers, true).getMergedHeader());
}
From source file:TestIndexMergeMR.java
License:Open Source License
public void testIndexMergeMR() throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    String indexdir = "indexdir";
    String indexdir1 = "indexdir1";
    int filenum = 10;
    int recnum = 1000;
    short idx = 0;
    TestUtil.genifdfindex(indexdir, filenum, recnum, idx, true);

    StringBuffer sb = new StringBuffer();
    FileStatus[] ss = fs.listStatus(new Path(indexdir));
    for (FileStatus fileStatus : ss) {
        sb.append(fileStatus.getPath().toString()).append(",");
    }
    IndexMergeMR.running(sb.substring(0, sb.length() - 1), indexdir1, conf);

    IFormatDataFile ifdf = new IFormatDataFile(conf);
    ifdf.open(indexdir1 + "/part-00000");
    for (int i = 0; i < 100; i++) {
        ifdf.next().show();
    }
    ifdf.close();

    fs.delete(new Path(indexdir), true);
    fs.delete(new Path(indexdir1), true);
}
From source file:Vectors.java
License:Apache License
public static Vector readSequenceFile(Path path, Configuration conf) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    for (FileStatus fileStatus : fs.listStatus(path)) {
        if (fileStatus.getPath().getName().contains("part-")) {
            SequenceFile.Reader reader = null;
            try {
                reader = new SequenceFile.Reader(fs, fileStatus.getPath(), conf);
                Text key = (Text) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
                VectorWritable value = (VectorWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
                reader.next(key, value);
                return value.get();
            } finally {
                IOUtils.closeStream(reader);
            }
        }
    }
    return null;
}
From source file:ClassifierHD.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 6) {
        System.out.println(
                "Arguments: [model] [label index] [dictionary] [document frequency] [postgres table] [hdfs dir] [job_id]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tablename = args[4];
    String inputDir = args[5];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    Connection conn = null;
    PreparedStatement pstmt = null;

    try {
        Class.forName("org.postgresql.Driver");
        conn = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres",
                "dbwpsdkdl");
        conn.setAutoCommit(false);
        String sql = "INSERT INTO " + tablename
                + " (id,gtime,wtime,target,num,link,body,rep) VALUES (?,?,?,?,?,?,?,?);";
        pstmt = conn.prepareStatement(sql);

        FileSystem fs = FileSystem.get(configuration);
        FileStatus[] status = fs.listStatus(new Path(inputDir));
        BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(fs.create(new Path(inputDir + "/rep.list"), true)));

        for (int i = 0; i < status.length; i++) {
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(status[i].getPath())));
            if (new String(status[i].getPath().getName()).equals("rep.list")) {
                continue;
            }

            int lv_HEAD = 1;
            int lv_cnt = 0;
            String lv_gtime = null;
            String lv_wtime = null;
            String lv_target = null;
            BigDecimal lv_num = null;
            String lv_link = null;
            String[] lv_args;
            String lv_line;
            StringBuilder lv_txt = new StringBuilder();
            while ((lv_line = br.readLine()) != null) {
                if (lv_cnt < lv_HEAD) {
                    lv_args = lv_line.split(",");
                    lv_gtime = lv_args[0];
                    lv_wtime = lv_args[1];
                    lv_target = lv_args[2];
                    lv_num = new BigDecimal(lv_args[3]);
                    lv_link = lv_args[4];
                } else {
                    lv_txt.append(lv_line + '\n');
                }
                lv_cnt++;
            }
            br.close();

            String id = status[i].getPath().getName();
            String message = lv_txt.toString();

            Multiset<String> words = ConcurrentHashMultiset.create();

            TokenStream ts = analyzer.tokenStream("text", new StringReader(message));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            int wordCount = 0;
            while (ts.incrementToken()) {
                if (termAtt.length() > 0) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    Integer wordId = dictionary.get(word);
                    if (wordId != null) {
                        words.add(word);
                        wordCount++;
                    }
                }
            }
            ts.end();
            ts.close();

            Vector vector = new RandomAccessSparseVector(10000);
            TFIDF tfidf = new TFIDF();
            for (Multiset.Entry<String> entry : words.entrySet()) {
                String word = entry.getElement();
                int count = entry.getCount();
                Integer wordId = dictionary.get(word);
                Long freq = documentFrequency.get(wordId);
                double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
                vector.setQuick(wordId, tfIdfValue);
            }

            Vector resultVector = classifier.classifyFull(vector);
            double bestScore = -Double.MAX_VALUE;
            int bestCategoryId = -1;
            for (Element element : resultVector.all()) {
                int categoryId = element.index();
                double score = element.get();
                if (score > bestScore) {
                    bestScore = score;
                    bestCategoryId = categoryId;
                }
            }
            //System.out.println(message);
            //System.out.println(" => " + lv_gtime + lv_wtime + lv_link + id + ":" + labels.get(bestCategoryId));
            pstmt.setString(1, id);
            pstmt.setString(2, lv_gtime);
            pstmt.setString(3, lv_wtime);
            pstmt.setString(4, lv_target);
            pstmt.setBigDecimal(5, lv_num);
            pstmt.setString(6, lv_link);
            pstmt.setString(7, message.substring(1, Math.min(50, message.length())));
            pstmt.setString(8, labels.get(bestCategoryId));
            pstmt.addBatch();
            bw.write(id + "\t" + labels.get(bestCategoryId) + "\n");
        }

        pstmt.executeBatch();
        //pstmt.clearParameters();
        pstmt.close();
        conn.commit();
        conn.close();
        bw.close();
    } catch (Exception e) {
        System.err.println(e.getClass().getName() + ": " + e.getMessage());
        System.exit(0);
    }
    analyzer.close();
}
From source file:AggregatedLogsPurger.java
License:Apache License
public boolean purge() throws IOException {
    LocalDateTime now = LocalDateTime.now();
    LocalDateTime deleteLogsOlderThanTime = now.minusDays(deleteOlderThanDays);

    // Identify which log dirs should be deleted
    FileSystem fs = rootLogDir.getFileSystem(conf);
    try {
        long totalBytes = 0;
        for (FileStatus userDir : fs.listStatus(rootLogDir)) {
            if (userDir.isDirectory()) {
                Path userDirPath = new Path(userDir.getPath(), suffix);
                System.out.println("Checking for userDir : " + userDirPath);
                for (FileStatus appDir : fs.listStatus(userDirPath)) {
                    LocalDateTime appDirDate = getAppDirDateTime(appDir.getModificationTime());
                    if (appDirDate.isBefore(deleteLogsOlderThanTime)) {
                        long size = getLengthRecursively(fs, appDir.getPath());
                        System.out.println(appDir.getPath() + ", " + appDir.getOwner() + ", "
                                + appDirDate.toString() + ", size=" + size);
                        totalBytes += size;
                        if (shouldDelete) {
                            System.out.println("Deleting " + appDir.getPath());
                            fs.delete(appDir.getPath(), true);
                        }
                    }
                }
            }
        }
        System.out.println("Savings : " + totalBytes);
    } catch (IOException e) {
        e.printStackTrace();
        return false;
    } finally {
        fs.close();
    }
    return true;
}