Example usage for org.apache.hadoop.fs Path toString

List of usage examples for org.apache.hadoop.fs Path toString

Introduction

On this page you can find example usages of org.apache.hadoop.fs.Path.toString().

Prototype

@Override
public String toString()
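
As a quick illustration, a minimal sketch of what toString() returns for a qualified and a relative Path (the scheme, host, and paths below are invented):

Path qualified = new Path("hdfs://namenode:8020/user/data/part-00000");
Path relative = new Path("logs/2020-01-01");

// toString() renders the Path as its URI-style string, keeping the scheme
// and authority when the Path was constructed with them.
System.out.println(qualified.toString()); // hdfs://namenode:8020/user/data/part-00000
System.out.println(relative.toString());  // logs/2020-01-01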


Usage

From source file:com.cloudera.recordbreaker.analyzer.FSAnalyzer.java

License:Open Source License

/**
 * <code>addFileMetadata</code> stores the pathname, size, owner, etc.
 */
void addFileMetadata(final FileStatus fstatus, final long crawlId) {
    // Compute strings to represent file metadata
    Path insertFile = fstatus.getPath();
    final boolean isDir = fstatus.isDir();
    FsPermission fsp = fstatus.getPermission();
    final String permissions = (isDir ? "d" : "-") + fsp.getUserAction().SYMBOL + fsp.getGroupAction().SYMBOL
            + fsp.getOtherAction().SYMBOL;

    // Compute formal pathname representation
    String fnameString = null;
    String parentPathString = null;
    if (isDir && insertFile.getParent() == null) {
        parentPathString = "";
        fnameString = insertFile.toString();
    } else {
        fnameString = insertFile.getName();
        parentPathString = insertFile.getParent().toString();

        // REMIND --- mjc --- If we want to modify the Files table s.t. it does
        // not contain the filesystem prefix, then this would be the place to do it.

        if (!parentPathString.endsWith("/")) {
            parentPathString = parentPathString + "/";
        }
    }
    final String parentPath = parentPathString;
    final String fName = fnameString;
    final long fileId = dbQueue.execute(new SQLiteJob<Long>() {
        protected Long job(SQLiteConnection db) throws SQLiteException {
            SQLiteStatement stmt = db.prepare("INSERT into Files VALUES(null, ?, ?, ?, ?, ?, ?, ?, ?, ?)");
            try {
                stmt.bind(1, isDir ? "True" : "False").bind(2, crawlId).bind(3, fName)
                        .bind(4, fstatus.getOwner()).bind(5, fstatus.getGroup()).bind(6, permissions)
                        .bind(7, fstatus.getLen())
                        .bind(8, fileDateFormat.format(new Date(fstatus.getModificationTime())))
                        .bind(9, parentPath);
                stmt.step();
                return db.getLastInsertId();
            } finally {
                stmt.dispose();
            }
        }
    }).complete();
}
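
The snippet above splits each Path into a parent-directory string and a file name via getParent().toString() and getName(). A small sketch of that decomposition (the path is invented):

Path p = new Path("hdfs://namenode:8020/user/alice/data.csv");
String parentPath = p.getParent().toString(); // hdfs://namenode:8020/user/alice
String fileName = p.getName();                // data.csv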

From source file:com.cloudera.recordbreaker.analyzer.FSAnalyzer.java

License:Open Source License

/**
 * Get the parents for the given directory from a given crawl
 */
public List<FileSummary> getDirParents(final long crawlid, final String targetDirStr) {
    return dbQueue.execute(new SQLiteJob<List<FileSummary>>() {
        protected List<FileSummary> job(SQLiteConnection db) throws SQLiteException {
            List<FileSummary> output = new ArrayList<FileSummary>();
            SQLiteStatement stmt = db.prepare(
                    "select fid, path, fname from Files WHERE crawlid = ? AND length(?) > length(path||fname) AND isDir = 'True' AND replace(?, path||fname, '') LIKE '/%'");
            try {
                Path targetDir = new Path(targetDirStr);
                if (targetDir.getParent() != null) {
                    stmt.bind(1, crawlid).bind(2, targetDir.toString()).bind(3, targetDir.toString());
                    while (stmt.step()) {
                        //Path p = new Path(stmt.columnString(0) + stmt.columnString(1));
                        output.add(new FileSummary(FSAnalyzer.this, stmt.columnLong(0)));
                    }
                }
            } finally {
                stmt.dispose();
            }
            return output;
        }
    }).complete();
}

From source file:com.cloudera.recordbreaker.analyzer.FSCrawler.java

License:Open Source License

/**
 * <code>getStartNonblockingCrawl</code> traverses a given filesystem.  It returns immediately
 * and does not wait for the crawl to complete.
 * If the crawl is created or is already ongoing, it returns true.
 * If the crawl is not currently running and cannot be started, it returns false.
 */
public synchronized boolean getStartNonblockingCrawl(final URI fsURI) {
    try {
        final int subdirDepth = INFINITE_CRAWL_DEPTH;
        long fsId = analyzer.getCreateFilesystem(fsURI, true);
        if (fsId < 0) {
            return false;
        }
        LOG.info("Grabbing filesystem: " + fsURI);
        final FileSystem fs = FileSystem.get(fsURI, new Configuration());
        final Path startDir = fs.makeQualified(new Path(fsURI.getPath()));

        final long crawlid = analyzer.getCreatePendingCrawl(fsId, true);
        Thread pendingThread = pendingCrawls.get(crawlid);
        if (pendingThread == null) {
            Thread t = new Thread() {
                public void run() {
                    try {
                        synchronized (pendingCrawls) {
                            pendingCrawls.put(crawlid, this);
                        }
                        synchronized (crawlStatusInfo) {
                            crawlStatusInfo.put(crawlid, new CrawlRuntimeStatus("Initializing crawl"));
                        }
                        // Build the file and dir-level todo lists
                        List<Path> todoFileList = new ArrayList<Path>();
                        List<Path> todoDirList = new ArrayList<Path>();
                        recursiveCrawlBuildList(fs, startDir, subdirDepth, crawlid, todoFileList, todoDirList);

                        // Get the files to process
                        TreeSet<String> observedFilenames = new TreeSet<String>();
                        for (Path p : analyzer.getFilesForCrawl(crawlid)) {
                            observedFilenames.add(p.toString());
                        }
                        for (Iterator<Path> it = todoFileList.iterator(); it.hasNext();) {
                            Path p = it.next();
                            if (observedFilenames.contains(p.toString())) {
                                it.remove();
                            }
                        }

                        // Get the dirs to process
                        TreeSet<String> observedDirnames = new TreeSet<String>();
                        for (Path p : analyzer.getDirsForCrawl(crawlid)) {
                            observedDirnames.add(p.toString());
                        }
                        for (Iterator<Path> it = todoDirList.iterator(); it.hasNext();) {
                            Path p = it.next();
                            if (observedDirnames.contains(p.toString())) {
                                it.remove();
                            }
                        }

                        synchronized (crawlStatusInfo) {
                            CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
                            cstatus.setMessage("Processing files");
                            cstatus.setNumToProcess(todoFileList.size());
                            cstatus.setNumDone(0);
                        }

                        int numDone = 0;
                        for (Path p : todoDirList) {
                            try {
                                analyzer.addSingleFile(fs, p, crawlid);
                            } catch (IOException iex) {
                                iex.printStackTrace();
                            }
                        }
                        for (Path p : todoFileList) {
                            synchronized (crawlStatusInfo) {
                                CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
                                cstatus.setMessage("Processing file " + p.toString());
                            }
                            try {
                                analyzer.addSingleFile(fs, p, crawlid);
                            } catch (Exception iex) {
                                iex.printStackTrace();
                            }
                            numDone++;
                            synchronized (crawlStatusInfo) {
                                CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
                                cstatus.setNumDone(numDone);
                                if (cstatus.shouldFinish()) {
                                    break;
                                }
                            }
                        }
                    } catch (IOException iex) {
                        iex.printStackTrace();
                    } finally {
                        try {
                            synchronized (pendingCrawls) {
                                pendingCrawls.remove(crawlid);
                                analyzer.completeCrawl(crawlid);
                            }
                        } catch (SQLiteException sle) {
                        }
                    }
                }
            };
            t.start();
        }
        return true;
    } catch (Exception iex) {
        iex.printStackTrace();
    }
    return false;
}
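
In the crawl thread above, Path.toString() supplies the canonical string key used to skip files and directories that were already recorded for the crawl. A minimal sketch of that de-duplication pattern (the path is invented):

TreeSet<String> observed = new TreeSet<String>();
observed.add(new Path("hdfs://namenode:8020/logs/a.log").toString());
// The same string form is later used for the membership test:
boolean alreadyCrawled = observed.contains(new Path("hdfs://namenode:8020/logs/a.log").toString()); // true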

From source file:com.cloudera.recordbreaker.analyzer.GenericDataDescriptor.java

License:Open Source License

public String getHiveImportDataStatement(String tablename, Path importFile) {
    String fname = importFile.toString();
    String localMarker = "";
    if (fname.startsWith("file")) {
        localMarker = "local ";
    }
    String loadTxt = "load data " + localMarker + "inpath '" + importFile + "' overwrite into table "
            + tablename;
    return loadTxt;
}
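
A hedged usage sketch of the method above, assuming a hypothetical GenericDataDescriptor instance named descriptor and an invented table and file name:

Path importFile = new Path("file:/tmp/sample.csv");
String loadStmt = descriptor.getHiveImportDataStatement("weblogs", importFile);
// loadStmt: "load data local inpath 'file:/tmp/sample.csv' overwrite into table weblogs"
// The "local " marker is added because the path's string form starts with "file".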

From source file:com.cloudera.recordbreaker.fisheye.FishEye.java

License:Open Source License

public String getTopDir() {
    URI fsUri = getFSURI();
    if (fsUri == null) {
        return null;
    }
    long fsid = analyzer.getCreateFilesystem(fsUri, false);
    if (fsid >= 0) {
        long crawlid = analyzer.getLatestCompleteCrawl(fsid);
        if (crawlid >= 0) {
            Path td = analyzer.getTopDir(crawlid);
            return td.toString();
        }
    }
    return null;
}

From source file:com.cloudera.recordbreaker.learnstructure.LearnStructure.java

License:Open Source License

public static void main(String argv[]) throws IOException {
    if (argv.length < 2) {
        System.err.println("Usage: LearnStructure <input-datafile> <outdir> (-emitAvro (true)|false)");
        return;
    }
    FileSystem localFS = FileSystem.getLocal(new Configuration());
    boolean emitAvro = true;
    int i = 0;
    Path f = new Path(new File(argv[i++]).getCanonicalPath());
    File outdir = new File(argv[i++]).getCanonicalFile();
    for (; i < argv.length; i++) {
        if ("-emitAvro".equals(argv[i])) {
            i++;
            emitAvro = "true".equals(argv[i]);
        }
    }

    System.err.println("Input file: " + f.toString());
    System.err.println("Output directory: " + outdir.getCanonicalPath());
    if (outdir.exists()) {
        throw new IOException("Output directory already exists: " + outdir);
    }
    outdir.mkdirs();
    Path schemaFile = new Path(outdir.getCanonicalPath(), SCHEMA_FILENAME);
    Path parseTreeFile = new Path(outdir.getCanonicalPath(), PARSER_FILENAME);
    Path jsonDataFile = null;
    Path avroDataFile = null;
    if (emitAvro) {
        jsonDataFile = new Path(outdir.getCanonicalPath(), JSONDATA_FILENAME);
        avroDataFile = new Path(outdir.getCanonicalPath(), DATA_FILENAME);
    }

    LearnStructure ls = new LearnStructure();
    ls.inferRecordFormat(localFS, f, localFS, schemaFile, parseTreeFile, jsonDataFile, avroDataFile, true, -1);
}

From source file:com.cloudera.recordbreaker.learnstructure.test.InferenceTest.java

License:Open Source License

/**
 * runSingletonTest() executes LearnStructure test for a single given input text file.
 *
 * @param inputData a <code>File</code> value
 * @return a <code>boolean</code> value;  did the test succeed?
 */
boolean runSingletonTest(File workingDir, File inputData) {
    File tmpSingletonDir = new File(workingDir, "testinference-" + inputData.getName());
    try {
        FileSystem localFS = FileSystem.getLocal(new Configuration());
        tmpSingletonDir.mkdir();
        Path schemaFile = new Path(tmpSingletonDir.getCanonicalPath(), LearnStructure.SCHEMA_FILENAME);
        Path parseTreeFile = new Path(tmpSingletonDir.getCanonicalPath(), LearnStructure.PARSER_FILENAME);
        Path jsonDataFile = new Path(tmpSingletonDir.getCanonicalPath(), LearnStructure.JSONDATA_FILENAME);
        Path avroFile = new Path(tmpSingletonDir.getCanonicalPath(), LearnStructure.DATA_FILENAME);

        LearnStructure ls = new LearnStructure();
        // Check to see how many records exist in the original input
        int lineCount = 0;
        BufferedReader in2 = new BufferedReader(new FileReader(inputData));
        try {
            while (in2.readLine() != null) {
                lineCount++;
            }
        } finally {
            in2.close();
        }

        // Infer structure
        ls.inferRecordFormat(localFS, new Path(inputData.getCanonicalPath()), localFS, schemaFile,
                parseTreeFile, jsonDataFile, avroFile, false, lineCount);

        // Test the inferred structure
        // First, load in the avro file and see how many records there are.
        int avroCount = 0;
        DataFileReader in = new DataFileReader(new File(avroFile.toString()), new GenericDatumReader());
        try {
            Iterator it = in.iterator();
            while (it.hasNext()) {
                avroCount++;
                it.next();
            }
        } finally {
            in.close();
        }

        // Was the synthesized parser able to figure out the file?
        double parseRatio = avroCount / (1.0 * lineCount);
        return (parseRatio > MIN_PARSE_RATIO);
    } catch (IOException e) {
        try {
            System.err.println("File: " + inputData.getCanonicalPath());
        } catch (IOException ex) {
            ex.printStackTrace();
        }
        e.printStackTrace();
        return false;
    } finally {
        // remove temp files
        tmpSingletonDir.delete();
    }
}
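
Note that avroFile.toString() is handed straight to java.io.File above; this works because the Path was built from a plain local directory with no scheme, so its string form is an ordinary filesystem path. A small sketch (the directory name is invented):

Path avroFile = new Path("/tmp/testinference-sample", LearnStructure.DATA_FILENAME);
File asFile = new File(avroFile.toString()); // e.g. /tmp/testinference-sample/<value of DATA_FILENAME>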

From source file:com.cloudera.recordservice.examples.terasort.TeraSort.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    boolean useRecordService = false;
    if (args.length != 2 && args.length != 3) {
        usage();
        return 1;
    }
    if (args.length == 3) {
        useRecordService = Boolean.parseBoolean(args[2]);
    }

    LOG.info("starting");
    Job job = Job.getInstance(getConf());
    boolean useSimplePartitioner = getUseSimplePartitioner(job);

    if (useRecordService) {
        RecordServiceConfig.setInputTable(job.getConfiguration(), null, args[0]);
        job.setInputFormatClass(RecordServiceTeraInputFormat.class);
        useSimplePartitioner = true;
    } else {
        Path inputDir = new Path(args[0]);
        TeraInputFormat.setInputPaths(job, inputDir);
        job.setInputFormatClass(TeraInputFormat.class);
    }

    Path outputDir = new Path(args[1]);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TeraOutputFormat.class);
    if (useSimplePartitioner) {
        job.setPartitionerClass(SimplePartitioner.class);
    } else {
        long start = System.currentTimeMillis();
        Path partitionFile = new Path(outputDir, TeraInputFormat.PARTITION_FILENAME);
        URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME);
        try {
            TeraInputFormat.writePartitionFile(job, partitionFile);
        } catch (Throwable e) {
            LOG.error(e.getMessage());
            return -1;
        }
        job.addCacheFile(partitionUri);
        long end = System.currentTimeMillis();
        System.out.println("Spent " + (end - start) + "ms computing partitions.");
        job.setPartitionerClass(TotalOrderPartitioner.class);
    }

    job.getConfiguration().setInt("dfs.replication", getOutputReplication(job));
    TeraOutputFormat.setFinalSync(job, true);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    LOG.info("done");
    return ret;
}
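
The partition-file handling above appends a "#" fragment to Path.toString() so the cached file is exposed to each task under a fixed local name. A minimal sketch of that pattern, assuming job is an org.apache.hadoop.mapreduce.Job and the path is invented:

Path partitionFile = new Path("/user/out/_partition.lst");
// The fragment after '#' becomes the symlink name visible in each task's working directory.
URI partitionUri = new URI(partitionFile.toString() + "#" + "_partition.lst");
job.addCacheFile(partitionUri);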

From source file:com.cloudera.recordservice.mr.RecordServiceConfig.java

License:Apache License

/**
 * Set the array of {@link Path}s as the list of inputs
 * for the map-reduce job.
 */
public static void setInputPaths(Configuration conf, Path... inputPaths) throws IOException {
    Path path = inputPaths[0].getFileSystem(conf).makeQualified(inputPaths[0]);
    StringBuffer str = new StringBuffer(StringUtils.escapeString(path.toString()));
    for (int i = 1; i < inputPaths.length; ++i) {
        str.append(StringUtils.COMMA_STR);
        path = inputPaths[i].getFileSystem(conf).makeQualified(inputPaths[i]);
        str.append(StringUtils.escapeString(path.toString()));
    }
    conf.set("mapred.input.dir", str.toString());
}
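
A hedged usage sketch of setInputPaths: with two invented paths, the helper stores a comma-separated list of the fully qualified, escaped path strings under mapred.input.dir (the scheme and authority come from the default filesystem in conf):

Configuration conf = new Configuration();
RecordServiceConfig.setInputPaths(conf, new Path("/data/a"), new Path("/data/b"));
// conf.get("mapred.input.dir") might now be:
//   "hdfs://namenode:8020/data/a,hdfs://namenode:8020/data/b"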

From source file:com.cloudera.spark.bulkload.TotalOrderPartitioner.java

License:Apache License

/**
 * Set the path to the SequenceFile storing the sorted partition keyset.
 * It must be the case that for <tt>R</tt> reduces, there are <tt>R-1</tt>
 * keys in the SequenceFile.
 */
public static void setPartitionFile(Configuration conf, Path p) {
    conf.set(PARTITIONER_PATH, p.toString());
}
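
A short usage sketch of setPartitionFile (the PARTITIONER_PATH key is defined elsewhere in the class; the path below is invented):

Configuration conf = new Configuration();
TotalOrderPartitioner.setPartitionFile(conf, new Path("/user/out/_partition.lst"));
// The partitioner later reads this value back and reopens the file by its string form.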