List of usage examples for org.apache.hadoop.fs FileSystem listStatus
public FileStatus[] listStatus(Path f) throws FileNotFoundException, IOException
public FileStatus[] listStatus(Path[] files) throws FileNotFoundException, IOException
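A minimal sketch of the call pattern shared by the examples below: get a FileSystem from a Configuration, call listStatus on a directory, and iterate over the returned FileStatus entries. The path "/user/data" is a placeholder for illustration only, not taken from any of the source files.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ListStatusExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // listStatus returns one FileStatus per entry directly under the given path
        for (FileStatus status : fs.listStatus(new Path("/user/data"))) {
            System.out.println(status.getPath() + (status.isDirectory() ? " (dir)" : ""));
        }
    }
}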
From source file:FlinkBootstrap.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        throw new IllegalArgumentException(
                "Provide `TaskManager` or `JobManager` parameter with config folder");
    }

    // Load Hadoop S3 wrapper classes, due to ClassNotFoundException without
    Class.forName("org.apache.flink.runtime.fs.hdfs.HadoopFileSystem");
    Class.forName("org.apache.hadoop.fs.s3a.S3AFileSystem");

    // Verify s3 is accessible
    Configuration conf = new Configuration();
    conf.addResource(new Path("config/hadoop/core-site.xml"));
    conf.addResource(new Path("config/hadoop/hdfs-site.xml"));
    FileSystem fs = FileSystem.get(conf);
    fs.listStatus(new Path("s3://dir"));

    if (args[0].equals("TaskManager")) {
        TaskManager.main(new String[] { "--configDir", args[1], });
    } else if (args[0].equals("JobManager")) {
        JobManager.main(new String[] { "--configDir", args[1], "--executionMode", "cluster", });
    } else {
        throw new IllegalArgumentException("Unknown parameter `" + args[0] + "`");
    }
}
From source file:FDFGenData.java
License:Open Source License
public static void testwritefile(String tabledir, int num) throws Exception {
    String rawtmp = "/tmp/raw/rawfile";
    FileSystem fs = FileSystem.get(new Configuration());
    FileStatus[] fss = fs.listStatus(new Path(tabledir));
    int x = 0;
    if (fss != null) {
        x = fss.length;
    }

    PT.testgenrawfiler(rawtmp, num);
    PT.testwritefdf(tabledir + "file" + (x + 1), rawtmp, false, (short) -1);
    PT.testgenrawfiler(rawtmp, num);
    PT.testwritefdf(tabledir + "file" + (x + 2), rawtmp, false, (short) -1);
    PT.testgenrawfiler(rawtmp, num);
    PT.testwritefdf(tabledir + "file" + (x + 3), rawtmp, false, (short) -1);
    PT.testgenrawfiler(rawtmp, num);
    PT.testwritefdf(tabledir + "file" + (x + 4), rawtmp, false, (short) -1);
    PT.testgenrawfiler(rawtmp, num);
    PT.testwritefdf(tabledir + "file" + (x + 5), rawtmp, false, (short) -1);
}
From source file:HBaseBloomFilterSemiJoinSystemTest.java
License:Apache License
private static void listFiles(FileSystem fs, Path path) throws IOException {
    for (FileStatus status : fs.listStatus(path)) {
        LOG.info(status.getPath().toString());
        if (status.isDir()) {
            listFiles(fs, status.getPath());
        }
    }
}
From source file:RunPageRankSchimmy.java
License:Apache License
private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner,
        boolean useRange) throws Exception {
    Configuration conf = getConf();

    String in = path + "/iter" + FORMAT.format(i);
    String out = path + "/iter" + FORMAT.format(j) + "t";
    String outm = out + "-mass";

    FileSystem fs = FileSystem.get(conf);

    // We need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(conf).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-")) {
            numPartitions++;
        }
    }

    conf.setInt("NodeCount", n);

    Partitioner<IntWritable, Writable> p = null;
    if (useRange) {
        p = new RangePartitioner();
        ((Configurable) p).setConf(conf);
    } else {
        p = new HashPartitioner<IntWritable, Writable>();
    }

    // This is really annoying: the mapping between the partition numbers on
    // disk (i.e., part-XXXX) and what partition the file contains (i.e.,
    // key.hash % #reducer) is arbitrary... so this means that we need to
    // open up each partition, peek inside to find out.
    IntWritable key = new IntWritable();
    PageRankNode value = new PageRankNode();
    FileStatus[] status = fs.listStatus(new Path(in));

    StringBuilder sb = new StringBuilder();
    for (FileStatus f : status) {
        if (!f.getPath().getName().contains("part-")) {
            continue;
        }

        SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(f.getPath()));
        reader.next(key, value);
        int np = p.getPartition(key, value, numPartitions);
        reader.close();

        LOG.info(f.getPath() + "\t" + np);
        sb.append(np + "=" + f.getPath() + ";");
    }

    LOG.info(sb.toString().trim());

    LOG.info("PageRankSchimmy: iteration " + j + ": Phase1");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);
    LOG.info(" - nodeCnt: " + n);
    LOG.info(" - useCombiner: " + useCombiner);
    LOG.info(" - useInmapCombiner: " + useInmapCombiner);
    LOG.info(" - numPartitions: " + numPartitions);
    LOG.info(" - useRange: " + useRange);
    LOG.info("computed number of partitions: " + numPartitions);

    int numReduceTasks = numPartitions;

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    //conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.set("PageRankMassPath", outm);
    conf.set("BasePath", in);
    conf.set("PartitionMapping", sb.toString().trim());

    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    Job job = Job.getInstance(conf);
    job.setJobName("PageRankSchimmy:iteration" + j + ":Phase1");
    job.setJarByClass(RunPageRankSchimmy.class);

    job.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(FloatWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    if (useInmapCombiner) {
        job.setMapperClass(MapWithInMapperCombiningClass.class);
    } else {
        job.setMapperClass(MapClass.class);
    }

    if (useCombiner) {
        job.setCombinerClass(CombineClass.class);
    }

    if (useRange) {
        job.setPartitionerClass(RangePartitioner.class);
    }

    job.setReducerClass(ReduceClass.class);

    FileSystem.get(conf).delete(new Path(out), true);
    FileSystem.get(conf).delete(new Path(outm), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    float mass = Float.NEGATIVE_INFINITY;
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        fin.close();
    }

    return mass;
}
From source file:RunPageRankBasic.java
License:Apache License
private float phase1(int i, int j, String basePath, int numNodes, boolean useCombiner,
        boolean useInMapperCombiner) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName("PageRank:Basic:iteration" + j + ":Phase1");
    job.setJarByClass(RunPageRankBasic.class);

    String in = basePath + "/iter" + formatter.format(i);
    String out = basePath + "/iter" + formatter.format(j) + "t";
    String outm = out + "-mass";

    // We need to actually count the number of part files to get the number of partitions
    // (because the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(getConf()).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-"))
            numPartitions++;
    }

    LOG.info("PageRank: iteration " + j + ": Phase1");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);
    LOG.info(" - nodeCnt: " + numNodes);
    LOG.info(" - useCombiner: " + useCombiner);
    LOG.info(" - useInmapCombiner: " + useInMapperCombiner);
    LOG.info("computed number of partitions: " + numPartitions);

    int numReduceTasks = numPartitions;

    job.getConfiguration().setInt("NodeCount", numNodes);
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    //job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");
    job.getConfiguration().set("PageRankMassPath", outm);

    job.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(NonSplitableSequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(useInMapperCombiner ? MapWithInMapperCombiningClass.class : MapClass.class);

    if (useCombiner) {
        job.setCombinerClass(CombineClass.class);
    }

    job.setReducerClass(ReduceClass.class);

    FileSystem.get(getConf()).delete(new Path(out), true);
    FileSystem.get(getConf()).delete(new Path(outm), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    float mass = Float.NEGATIVE_INFINITY;
    FileSystem fs = FileSystem.get(getConf());
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        fin.close();
    }

    return mass;
}
From source file:HiveKeyIgnoringBAMOutputFormat.java
License:Open Source License
private void setSAMHeaderFrom(JobConf job) throws IOException {
    if (wrappedOutputFormat.getSAMHeader() != null)
        return;

    // XXX: We're not told where to take the SAM header from so we just merge
    // them all. There should probably be a better way of doing this.
    final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>();

    // The "best" sort order among the headers: unsorted if they're sorted
    // differently, otherwise their common sort order.
    SAMFileHeader.SortOrder sortOrder = null;

    // XXX: it seems that FileInputFormat.getInputPaths(job) will point to
    // the directories of the input tables in the query. I'm not sure if this
    // is always the case.
    for (final Path table : FileInputFormat.getInputPaths(job)) {
        final FileSystem fs = table.getFileSystem(job);
        for (final FileStatus stat : fs.listStatus(table)) {
            if (!stat.isFile())
                throw new IOException("Unexpected directory '" + stat.getPath() + "', expected only files");

            final SAMFileReader r = new SAMFileReader(fs.open(stat.getPath()));
            final SAMFileHeader h = r.getFileHeader();
            r.close();
            headers.add(h);

            if (sortOrder == null) {
                sortOrder = h.getSortOrder();
                continue;
            }
            if (sortOrder == SAMFileHeader.SortOrder.unsorted)
                continue;
            if (sortOrder != h.getSortOrder())
                sortOrder = SAMFileHeader.SortOrder.unsorted;
        }
    }

    wrappedOutputFormat.setSAMHeader(new SamFileHeaderMerger(sortOrder, headers, true).getMergedHeader());
}
From source file:TestIndexMergeMR.java
License:Open Source License
public void testIndexMergeMR() throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    String indexdir = "indexdir";
    String indexdir1 = "indexdir1";
    int filenum = 10;
    int recnum = 1000;
    short idx = 0;
    TestUtil.genifdfindex(indexdir, filenum, recnum, idx, true);

    StringBuffer sb = new StringBuffer();
    FileStatus[] ss = fs.listStatus(new Path(indexdir));
    for (FileStatus fileStatus : ss) {
        sb.append(fileStatus.getPath().toString()).append(",");
    }
    IndexMergeMR.running(sb.substring(0, sb.length() - 1), indexdir1, conf);

    IFormatDataFile ifdf = new IFormatDataFile(conf);
    ifdf.open(indexdir1 + "/part-00000");
    for (int i = 0; i < 100; i++) {
        ifdf.next().show();
    }
    ifdf.close();

    fs.delete(new Path(indexdir), true);
    fs.delete(new Path(indexdir1), true);
}
From source file:Vectors.java
License:Apache License
public static Vector readSequenceFile(Path path, Configuration conf) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    for (FileStatus fileStatus : fs.listStatus(path)) {
        if (fileStatus.getPath().getName().contains("part-")) {
            SequenceFile.Reader reader = null;
            try {
                reader = new SequenceFile.Reader(fs, fileStatus.getPath(), conf);
                Text key = (Text) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
                VectorWritable value = (VectorWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
                reader.next(key, value);
                return value.get();
            } finally {
                IOUtils.closeStream(reader);
            }
        }
    }
    return null;
}
From source file:ClassifierHD.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 6) {
        System.out.println(
                "Arguments: [model] [label index] [dictionary] [document frequency] [postgres table] [hdfs dir] [job_id]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tablename = args[4];
    String inputDir = args[5];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    Connection conn = null;
    PreparedStatement pstmt = null;

    try {
        Class.forName("org.postgresql.Driver");
        conn = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres",
                "dbwpsdkdl");
        conn.setAutoCommit(false);
        String sql = "INSERT INTO " + tablename
                + " (id,gtime,wtime,target,num,link,body,rep) VALUES (?,?,?,?,?,?,?,?);";
        pstmt = conn.prepareStatement(sql);

        FileSystem fs = FileSystem.get(configuration);
        FileStatus[] status = fs.listStatus(new Path(inputDir));
        BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(fs.create(new Path(inputDir + "/rep.list"), true)));

        for (int i = 0; i < status.length; i++) {
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(status[i].getPath())));
            if (new String(status[i].getPath().getName()).equals("rep.list")) {
                continue;
            }

            int lv_HEAD = 1;
            int lv_cnt = 0;
            String lv_gtime = null;
            String lv_wtime = null;
            String lv_target = null;
            BigDecimal lv_num = null;
            String lv_link = null;
            String[] lv_args;
            String lv_line;
            StringBuilder lv_txt = new StringBuilder();
            while ((lv_line = br.readLine()) != null) {
                if (lv_cnt < lv_HEAD) {
                    lv_args = lv_line.split(",");
                    lv_gtime = lv_args[0];
                    lv_wtime = lv_args[1];
                    lv_target = lv_args[2];
                    lv_num = new BigDecimal(lv_args[3]);
                    lv_link = lv_args[4];
                } else {
                    lv_txt.append(lv_line + '\n');
                }
                lv_cnt++;
            }
            br.close();

            String id = status[i].getPath().getName();
            String message = lv_txt.toString();

            Multiset<String> words = ConcurrentHashMultiset.create();

            TokenStream ts = analyzer.tokenStream("text", new StringReader(message));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            int wordCount = 0;
            while (ts.incrementToken()) {
                if (termAtt.length() > 0) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    Integer wordId = dictionary.get(word);
                    if (wordId != null) {
                        words.add(word);
                        wordCount++;
                    }
                }
            }
            ts.end();
            ts.close();

            Vector vector = new RandomAccessSparseVector(10000);
            TFIDF tfidf = new TFIDF();
            for (Multiset.Entry<String> entry : words.entrySet()) {
                String word = entry.getElement();
                int count = entry.getCount();
                Integer wordId = dictionary.get(word);
                Long freq = documentFrequency.get(wordId);
                double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
                vector.setQuick(wordId, tfIdfValue);
            }

            Vector resultVector = classifier.classifyFull(vector);
            double bestScore = -Double.MAX_VALUE;
            int bestCategoryId = -1;
            for (Element element : resultVector.all()) {
                int categoryId = element.index();
                double score = element.get();
                if (score > bestScore) {
                    bestScore = score;
                    bestCategoryId = categoryId;
                }
            }
            //System.out.println(message);
            //System.out.println(" => " + lv_gtime + lv_wtime + lv_link + id + ":" + labels.get(bestCategoryId));
            pstmt.setString(1, id);
            pstmt.setString(2, lv_gtime);
            pstmt.setString(3, lv_wtime);
            pstmt.setString(4, lv_target);
            pstmt.setBigDecimal(5, lv_num);
            pstmt.setString(6, lv_link);
            pstmt.setString(7, message.substring(1, Math.min(50, message.length())));
            pstmt.setString(8, labels.get(bestCategoryId));
            pstmt.addBatch();
            bw.write(id + "\t" + labels.get(bestCategoryId) + "\n");
        }

        pstmt.executeBatch();
        //pstmt.clearParameters();
        pstmt.close();
        conn.commit();
        conn.close();
        bw.close();
    } catch (Exception e) {
        System.err.println(e.getClass().getName() + ": " + e.getMessage());
        System.exit(0);
    }
    analyzer.close();
}
From source file:AggregatedLogsPurger.java
License:Apache License
public boolean purge() throws IOException {
    LocalDateTime now = LocalDateTime.now();
    LocalDateTime deleteLogsOlderThanTime = now.minusDays(deleteOlderThanDays);

    // Identify which log dirs should be deleted
    FileSystem fs = rootLogDir.getFileSystem(conf);
    try {
        long totalBytes = 0;
        for (FileStatus userDir : fs.listStatus(rootLogDir)) {
            if (userDir.isDirectory()) {
                Path userDirPath = new Path(userDir.getPath(), suffix);
                System.out.println("Checking for userDir : " + userDirPath);
                for (FileStatus appDir : fs.listStatus(userDirPath)) {
                    LocalDateTime appDirDate = getAppDirDateTime(appDir.getModificationTime());
                    if (appDirDate.isBefore(deleteLogsOlderThanTime)) {
                        long size = getLengthRecursively(fs, appDir.getPath());
                        System.out.println(appDir.getPath() + ", " + appDir.getOwner() + ", "
                                + appDirDate.toString() + ", size=" + size);
                        totalBytes += size;
                        if (shouldDelete) {
                            System.out.println("Deleting " + appDir.getPath());
                            fs.delete(appDir.getPath(), true);
                        }
                    }
                }
            }
        }
        System.out.println("Savings : " + totalBytes);
    } catch (IOException e) {
        e.printStackTrace();
        return false;
    } finally {
        fs.close();
    }
    return true;
}