List of usage examples for org.apache.hadoop.fs.Path.toString()
@Override
public String toString()
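Path.toString() returns the string form of the path, including the scheme and authority when the Path carries them. A minimal sketch of the behavior (the namenode address is a made-up example):

Path qualified = new Path("hdfs://namenode:8020/user/data", "part-00000");
System.out.println(qualified.toString()); // hdfs://namenode:8020/user/data/part-00000

Path relative = new Path("data/logs");
System.out.println(relative.toString()); // data/logs

The examples below show the method in context.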
From source file:com.cloudera.recordbreaker.analyzer.FSAnalyzer.java
License:Open Source License
/**
 * <code>addFileMetadata</code> stores the pathname, size, owner, etc.
 */
void addFileMetadata(final FileStatus fstatus, final long crawlId) {
    // Compute strings to represent file metadata
    Path insertFile = fstatus.getPath();
    final boolean isDir = fstatus.isDir();
    FsPermission fsp = fstatus.getPermission();
    final String permissions = (isDir ? "d" : "-") + fsp.getUserAction().SYMBOL
            + fsp.getGroupAction().SYMBOL + fsp.getOtherAction().SYMBOL;

    // Compute formal pathname representation
    String fnameString = null;
    String parentPathString = null;
    if (isDir && insertFile.getParent() == null) {
        parentPathString = "";
        fnameString = insertFile.toString();
    } else {
        fnameString = insertFile.getName();
        parentPathString = insertFile.getParent().toString();
        // REMIND --- mjc --- If we want to modify the Files table s.t. it does
        // not contain the filesystem prefix, then this would be the place to do it.
        if (!parentPathString.endsWith("/")) {
            parentPathString = parentPathString + "/";
        }
    }
    final String parentPath = parentPathString;
    final String fName = fnameString;
    final long fileId = dbQueue.execute(new SQLiteJob<Long>() {
        protected Long job(SQLiteConnection db) throws SQLiteException {
            SQLiteStatement stmt = db.prepare("INSERT into Files VALUES(null, ?, ?, ?, ?, ?, ?, ?, ?, ?)");
            try {
                stmt.bind(1, isDir ? "True" : "False").bind(2, crawlId).bind(3, fName)
                        .bind(4, fstatus.getOwner()).bind(5, fstatus.getGroup()).bind(6, permissions)
                        .bind(7, fstatus.getLen())
                        .bind(8, fileDateFormat.format(new Date(fstatus.getModificationTime())))
                        .bind(9, parentPath);
                stmt.step();
                return db.getLastInsertId();
            } finally {
                stmt.dispose();
            }
        }
    }).complete();
}
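The metadata code above splits a Path around its final component with getName() and getParent(); note that getParent().toString() carries no trailing slash, which is why the method appends one. A quick illustration with a hypothetical HDFS path:

Path p = new Path("hdfs://nn/data/logs/app.log"); // hypothetical path
p.getName();                // "app.log"
p.getParent().toString();   // "hdfs://nn/data/logs" (no trailing slash)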
From source file:com.cloudera.recordbreaker.analyzer.FSAnalyzer.java
License:Open Source License
/**
 * Get the parents for the given directory from a given crawl
 */
public List<FileSummary> getDirParents(final long crawlid, final String targetDirStr) {
    return dbQueue.execute(new SQLiteJob<List<FileSummary>>() {
        protected List<FileSummary> job(SQLiteConnection db) throws SQLiteException {
            List<FileSummary> output = new ArrayList<FileSummary>();
            SQLiteStatement stmt = db.prepare(
                    "select fid, path, fname from Files WHERE crawlid = ? AND length(?) > length(path||fname) AND isDir = 'True' AND replace(?, path||fname, '') LIKE '/%'");
            try {
                Path targetDir = new Path(targetDirStr);
                if (targetDir.getParent() != null) {
                    stmt.bind(1, crawlid).bind(2, targetDir.toString()).bind(3, targetDir.toString());
                    while (stmt.step()) {
                        //Path p = new Path(stmt.columnString(0) + stmt.columnString(1));
                        output.add(new FileSummary(FSAnalyzer.this, stmt.columnLong(0)));
                    }
                }
            } finally {
                stmt.dispose();
            }
            return output;
        }
    }).complete();
}
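The SQL treats path||fname as a directory's full pathname: a row qualifies when that string is a strict prefix of the bound target (the Path.toString() of targetDirStr) and the remainder begins with '/', so the query returns every ancestor directory of the target recorded in the crawl.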
From source file:com.cloudera.recordbreaker.analyzer.FSCrawler.java
License:Open Source License
/**
 * <code>getStartNonblockingCrawl</code> traverses a given filesystem. It returns immediately
 * and does not wait for the crawl to complete.
 * If the crawl is created or is already ongoing, it returns true.
 * If the crawl is not currently going and cannot start, it returns false.
 */
public synchronized boolean getStartNonblockingCrawl(final URI fsURI) {
    try {
        final int subdirDepth = INFINITE_CRAWL_DEPTH;
        long fsId = analyzer.getCreateFilesystem(fsURI, true);
        if (fsId < 0) {
            return false;
        }
        LOG.info("Grabbing filesystem: " + fsURI);
        final FileSystem fs = FileSystem.get(fsURI, new Configuration());
        final Path startDir = fs.makeQualified(new Path(fsURI.getPath()));
        final long crawlid = analyzer.getCreatePendingCrawl(fsId, true);
        Thread pendingThread = pendingCrawls.get(crawlid);
        if (pendingThread == null) {
            Thread t = new Thread() {
                public void run() {
                    try {
                        synchronized (pendingCrawls) {
                            pendingCrawls.put(crawlid, this);
                        }
                        synchronized (crawlStatusInfo) {
                            crawlStatusInfo.put(crawlid, new CrawlRuntimeStatus("Initializing crawl"));
                        }
                        // Build the file and dir-level todo lists
                        List<Path> todoFileList = new ArrayList<Path>();
                        List<Path> todoDirList = new ArrayList<Path>();
                        recursiveCrawlBuildList(fs, startDir, subdirDepth, crawlid, todoFileList, todoDirList);

                        // Get the files to process
                        TreeSet<String> observedFilenames = new TreeSet<String>();
                        for (Path p : analyzer.getFilesForCrawl(crawlid)) {
                            observedFilenames.add(p.toString());
                        }
                        for (Iterator<Path> it = todoFileList.iterator(); it.hasNext();) {
                            Path p = it.next();
                            if (observedFilenames.contains(p.toString())) {
                                it.remove();
                            }
                        }

                        // Get the dirs to process
                        TreeSet<String> observedDirnames = new TreeSet<String>();
                        for (Path p : analyzer.getDirsForCrawl(crawlid)) {
                            observedDirnames.add(p.toString());
                        }
                        for (Iterator<Path> it = todoDirList.iterator(); it.hasNext();) {
                            Path p = it.next();
                            if (observedDirnames.contains(p.toString())) {
                                it.remove();
                            }
                        }

                        synchronized (crawlStatusInfo) {
                            CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
                            cstatus.setMessage("Processing files");
                            cstatus.setNumToProcess(todoFileList.size());
                            cstatus.setNumDone(0);
                        }

                        int numDone = 0;
                        for (Path p : todoDirList) {
                            try {
                                analyzer.addSingleFile(fs, p, crawlid);
                            } catch (IOException iex) {
                                iex.printStackTrace();
                            }
                        }
                        for (Path p : todoFileList) {
                            synchronized (crawlStatusInfo) {
                                CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
                                cstatus.setMessage("Processing file " + p.toString());
                            }
                            try {
                                analyzer.addSingleFile(fs, p, crawlid);
                            } catch (Exception iex) {
                                iex.printStackTrace();
                            }
                            numDone++;
                            synchronized (crawlStatusInfo) {
                                CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
                                cstatus.setNumDone(numDone);
                                if (cstatus.shouldFinish()) {
                                    break;
                                }
                            }
                        }
                    } catch (IOException iex) {
                        iex.printStackTrace();
                    } finally {
                        try {
                            synchronized (pendingCrawls) {
                                pendingCrawls.remove(crawlid);
                                analyzer.completeCrawl(crawlid);
                            }
                        } catch (SQLiteException sle) {
                        }
                    }
                }
            };
            t.start();
        }
        return true;
    } catch (Exception iex) {
        iex.printStackTrace();
    }
    return false;
}
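Before processing, both todo lists are filtered against the paths already recorded for the crawl (compared by their Path.toString() values), so a resumed pending crawl skips files and directories it has already analyzed rather than starting over.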
From source file:com.cloudera.recordbreaker.analyzer.GenericDataDescriptor.java
License:Open Source License
public String getHiveImportDataStatement(String tablename, Path importFile) {
    String fname = importFile.toString();
    String localMarker = "";
    if (fname.startsWith("file")) {
        localMarker = "local ";
    }
    String loadTxt = "load data " + localMarker + "inpath '" + importFile + "' overwrite into table "
            + tablename;
    return loadTxt;
}
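Because importFile is concatenated directly into loadTxt, its Path.toString() value ends up in the generated HiveQL. A hypothetical call (the descriptor, table, and file names are made up), where a file: URI triggers the local marker:

// desc is a hypothetical GenericDataDescriptor instance
String stmt = desc.getHiveImportDataStatement("web_logs", new Path("file:/tmp/logs.csv"));
// -> load data local inpath 'file:/tmp/logs.csv' overwrite into table web_logs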
From source file:com.cloudera.recordbreaker.fisheye.FishEye.java
License:Open Source License
public String getTopDir() {
    URI fsUri = getFSURI();
    if (fsUri == null) {
        return null;
    }
    long fsid = analyzer.getCreateFilesystem(fsUri, false);
    if (fsid >= 0) {
        long crawlid = analyzer.getLatestCompleteCrawl(fsid);
        if (crawlid >= 0) {
            Path td = analyzer.getTopDir(crawlid);
            return td.toString();
        }
    }
    return null;
}
From source file:com.cloudera.recordbreaker.learnstructure.LearnStructure.java
License:Open Source License
public static void main(String argv[]) throws IOException {
    if (argv.length < 2) {
        System.err.println("Usage: LearnStructure <input-datafile> <outdir> (-emitAvro (true)|false)");
        return;
    }
    FileSystem localFS = FileSystem.getLocal(new Configuration());
    boolean emitAvro = true;
    int i = 0;
    Path f = new Path(new File(argv[i++]).getCanonicalPath());
    File outdir = new File(argv[i++]).getCanonicalFile();
    for (; i < argv.length; i++) {
        if ("-emitAvro".equals(argv[i])) {
            i++;
            emitAvro = "true".equals(argv[i]);
        }
    }
    System.err.println("Input file: " + f.toString());
    System.err.println("Output directory: " + outdir.getCanonicalPath());
    if (outdir.exists()) {
        throw new IOException("Output directory already exists: " + outdir);
    }
    outdir.mkdirs();
    Path schemaFile = new Path(outdir.getCanonicalPath(), SCHEMA_FILENAME);
    Path parseTreeFile = new Path(outdir.getCanonicalPath(), PARSER_FILENAME);
    Path jsonDataFile = null;
    Path avroDataFile = null;
    if (emitAvro) {
        jsonDataFile = new Path(outdir.getCanonicalPath(), JSONDATA_FILENAME);
        avroDataFile = new Path(outdir.getCanonicalPath(), DATA_FILENAME);
    }
    LearnStructure ls = new LearnStructure();
    ls.inferRecordFormat(localFS, f, localFS, schemaFile, parseTreeFile, jsonDataFile, avroDataFile, true, -1);
}
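A hypothetical invocation (class path omitted): java com.cloudera.recordbreaker.learnstructure.LearnStructure access.log /tmp/learned -emitAvro true. This infers a record format from access.log and writes the schema, parse tree, JSON, and Avro files under /tmp/learned, which must not already exist.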
From source file:com.cloudera.recordbreaker.learnstructure.test.InferenceTest.java
License:Open Source License
/**
 * runSingletonTest() executes LearnStructure test for a single given input text file.
 *
 * @param inputData a <code>File</code> value
 * @return a <code>boolean</code> value; did the test succeed?
 */
boolean runSingletonTest(File workingDir, File inputData) {
    File tmpSingletonDir = new File(workingDir, "testinference-" + inputData.getName());
    try {
        FileSystem localFS = FileSystem.getLocal(new Configuration());
        tmpSingletonDir.mkdir();
        Path schemaFile = new Path(tmpSingletonDir.getCanonicalPath(), LearnStructure.SCHEMA_FILENAME);
        Path parseTreeFile = new Path(tmpSingletonDir.getCanonicalPath(), LearnStructure.PARSER_FILENAME);
        Path jsonDataFile = new Path(tmpSingletonDir.getCanonicalPath(), LearnStructure.JSONDATA_FILENAME);
        Path avroFile = new Path(tmpSingletonDir.getCanonicalPath(), LearnStructure.DATA_FILENAME);
        LearnStructure ls = new LearnStructure();

        // Check to see how many records exist in the original input
        int lineCount = 0;
        BufferedReader in2 = new BufferedReader(new FileReader(inputData));
        try {
            while (in2.readLine() != null) {
                lineCount++;
            }
        } finally {
            in2.close();
        }

        // Infer structure
        ls.inferRecordFormat(localFS, new Path(inputData.getCanonicalPath()), localFS, schemaFile,
                parseTreeFile, jsonDataFile, avroFile, false, lineCount);

        // Test the inferred structure
        // First, load in the avro file and see how many records there are.
        int avroCount = 0;
        DataFileReader in = new DataFileReader(new File(avroFile.toString()), new GenericDatumReader());
        try {
            Iterator it = in.iterator();
            while (it.hasNext()) {
                avroCount++;
                it.next();
            }
        } finally {
            in.close();
        }

        // Was the synthesized parser able to figure out the file?
        double parseRatio = avroCount / (1.0 * lineCount);
        return (parseRatio > MIN_PARSE_RATIO);
    } catch (IOException e) {
        try {
            System.err.println("File: " + inputData.getCanonicalPath());
        } catch (IOException ex) {
            ex.printStackTrace();
        }
        e.printStackTrace();
        return false;
    } finally {
        // remove temp files
        tmpSingletonDir.delete();
    }
}
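Two details worth noting: avroFile is built from a canonical local path with no scheme, so avroFile.toString() yields a plain filesystem path that can be handed straight to java.io.File; and the raw DataFileReader compiles with warnings, where the typed equivalent would be new DataFileReader<GenericRecord>(new File(avroFile.toString()), new GenericDatumReader<GenericRecord>()).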
From source file:com.cloudera.recordservice.examples.terasort.TeraSort.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    boolean useRecordService = false;
    if (args.length != 2 && args.length != 3) {
        usage();
        return 1;
    }
    if (args.length == 3) {
        useRecordService = Boolean.parseBoolean(args[2]);
    }

    LOG.info("starting");
    Job job = Job.getInstance(getConf());
    boolean useSimplePartitioner = getUseSimplePartitioner(job);
    if (useRecordService) {
        RecordServiceConfig.setInputTable(job.getConfiguration(), null, args[0]);
        job.setInputFormatClass(RecordServiceTeraInputFormat.class);
        useSimplePartitioner = true;
    } else {
        Path inputDir = new Path(args[0]);
        TeraInputFormat.setInputPaths(job, inputDir);
        job.setInputFormatClass(TeraInputFormat.class);
    }

    Path outputDir = new Path(args[1]);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TeraOutputFormat.class);
    if (useSimplePartitioner) {
        job.setPartitionerClass(SimplePartitioner.class);
    } else {
        long start = System.currentTimeMillis();
        Path partitionFile = new Path(outputDir, TeraInputFormat.PARTITION_FILENAME);
        URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME);
        try {
            TeraInputFormat.writePartitionFile(job, partitionFile);
        } catch (Throwable e) {
            LOG.error(e.getMessage());
            return -1;
        }
        job.addCacheFile(partitionUri);
        long end = System.currentTimeMillis();
        System.out.println("Spent " + (end - start) + "ms computing partitions.");
        job.setPartitionerClass(TotalOrderPartitioner.class);
    }

    job.getConfiguration().setInt("dfs.replication", getOutputReplication(job));
    TeraOutputFormat.setFinalSync(job, true);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    LOG.info("done");
    return ret;
}
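The '#' fragment appended to partitionFile.toString() uses the distributed-cache symlink convention: the cached partition file appears in each task's working directory under the name TeraInputFormat.PARTITION_FILENAME, where the TotalOrderPartitioner expects to find it.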
From source file:com.cloudera.recordservice.mr.RecordServiceConfig.java
License:Apache License
/**
 * Set the array of {@link Path}s as the list of inputs
 * for the map-reduce job.
 */
public static void setInputPaths(Configuration conf, Path... inputPaths) throws IOException {
    Path path = inputPaths[0].getFileSystem(conf).makeQualified(inputPaths[0]);
    StringBuffer str = new StringBuffer(StringUtils.escapeString(path.toString()));
    for (int i = 1; i < inputPaths.length; ++i) {
        str.append(StringUtils.COMMA_STR);
        path = inputPaths[i].getFileSystem(conf).makeQualified(inputPaths[i]);
        str.append(StringUtils.escapeString(path.toString()));
    }
    conf.set("mapred.input.dir", str.toString());
}
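Each path is qualified against its filesystem before being escaped and joined, so scheme-less paths come out fully qualified. A sketch of the effect (the fs.defaultFS value is a made-up example):

Configuration conf = new Configuration();
// Assuming fs.defaultFS is hdfs://nn:8020 (hypothetical address):
RecordServiceConfig.setInputPaths(conf, new Path("/data/a"), new Path("/data/b"));
// conf.get("mapred.input.dir") -> "hdfs://nn:8020/data/a,hdfs://nn:8020/data/b"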
From source file:com.cloudera.spark.bulkload.TotalOrderPartitioner.java
License:Apache License
/**
 * Set the path to the SequenceFile storing the sorted partition keyset.
 * It must be the case that for <tt>R</tt> reduces, there are <tt>R-1</tt>
 * keys in the SequenceFile.
 */
public static void setPartitionFile(Configuration conf, Path p) {
    conf.set(PARTITIONER_PATH, p.toString());
}
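A minimal usage sketch, assuming a job and output directory are already set up (the _partition.lst name is a conventional choice, not required):

Configuration conf = job.getConfiguration();
Path partitionFile = new Path(outputDir, "_partition.lst"); // hypothetical location
TotalOrderPartitioner.setPartitionFile(conf, partitionFile); // stored via Path.toString()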