List of usage examples for org.apache.hadoop.fs.FileSystem#makeQualified
public Path makeQualified(Path path)
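Before the source-file examples below, a minimal sketch of the call itself: makeQualified resolves a possibly relative Path against the FileSystem's scheme, authority, and current working directory and returns an absolute, fully qualified Path. The class name, file name, and the filesystem URI in the comment are illustrative assumptions, not output of any particular cluster.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MakeQualifiedSketch {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        // A relative path is resolved against the filesystem's working directory.
        Path qualified = fs.makeQualified(new Path("data/users.avro"));
        // Might print something like hdfs://namenode:8020/user/alice/data/users.avro (illustrative).
        System.out.println(qualified);
    }
}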
From source file:com.cloudera.cdk.data.filesystem.FileSystemDatasetRepository.java
License:Apache License
/**
 * Get a {@link com.cloudera.cdk.data.PartitionKey} corresponding to a partition's filesystem path
 * represented as a {@link URI}. If the path is not a valid partition,
 * then {@link IllegalArgumentException} is thrown. Note that the partition does not
 * have to exist.
 * @param dataset the filesystem dataset
 * @param partitionPath a directory path where the partition data is stored
 * @return a partition key representing the partition at the given path
 * @since 0.4.0
 */
@SuppressWarnings("deprecation")
public static PartitionKey partitionKeyForPath(Dataset dataset, URI partitionPath) {
    Preconditions.checkState(dataset.getDescriptor().isPartitioned(),
        "Attempt to get a partition on a non-partitioned dataset (name:%s)", dataset.getName());
    Preconditions.checkArgument(dataset instanceof FileSystemDataset,
        "Dataset is not a FileSystemDataset");
    FileSystemDataset fsDataset = (FileSystemDataset) dataset;

    FileSystem fs = fsDataset.getFileSystem();
    URI partitionUri = fs.makeQualified(new Path(partitionPath)).toUri();
    URI directoryUri = fsDataset.getDirectory().toUri();
    URI relativizedUri = directoryUri.relativize(partitionUri);

    if (relativizedUri.equals(partitionUri)) {
        throw new IllegalArgumentException(String.format("Partition URI %s has different "
            + "root directory to dataset (directory: %s).", partitionUri, directoryUri));
    }

    Iterable<String> parts = Splitter.on('/').split(relativizedUri.getPath());

    PartitionStrategy partitionStrategy = dataset.getDescriptor().getPartitionStrategy();
    List<FieldPartitioner> fieldPartitioners = partitionStrategy.getFieldPartitioners();
    if (Iterables.size(parts) > fieldPartitioners.size()) {
        throw new IllegalArgumentException(String.format("Too many partition directories "
            + "for %s (%s), expecting %s.", partitionUri, Iterables.size(parts),
            fieldPartitioners.size()));
    }

    List<Object> values = Lists.newArrayList();
    int i = 0;
    for (String part : parts) {
        Iterator<String> split = Splitter.on('=').split(part).iterator();
        String fieldName = split.next();
        FieldPartitioner fp = fieldPartitioners.get(i++);
        if (!fieldName.equals(fp.getName())) {
            throw new IllegalArgumentException(String.format("Unrecognized partition name "
                + "'%s' in partition %s, expecting '%s'.", fieldName, partitionUri, fp.getName()));
        }
        if (!split.hasNext()) {
            throw new IllegalArgumentException(String.format("Missing partition value for "
                + "'%s' in partition %s.", fieldName, partitionUri));
        }
        String stringValue = split.next();
        Object value = fp.valueFromString(stringValue);
        values.add(value);
    }
    return com.cloudera.cdk.data.impl.Accessor.getDefault()
        .newPartitionKey(values.toArray(new Object[values.size()]));
}
From source file:com.cloudera.cdk.data.hcatalog.HiveUtils.java
License:Apache License
static DatasetDescriptor descriptorForTable(Configuration conf, Table table) {
    final DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder();

    final String serializationLib = table.getSerializationLib();
    if (SERDE_TO_FORMAT.containsKey(serializationLib)) {
        builder.format(SERDE_TO_FORMAT.get(serializationLib));
    } else {
        // TODO: should this use an "unknown" format? others fail in open()
        throw new UnknownFormatException("Unknown format for serde:" + serializationLib);
    }

    final Path dataLocation = new Path(table.getDataLocation());
    final FileSystem fs = fsForPath(conf, dataLocation);

    builder.location(fs.makeQualified(dataLocation));

    // custom properties
    String namesProperty = table.getProperty(CUSTOM_PROPERTIES_PROPERTY_NAME);
    if (namesProperty != null) {
        for (String property : NAME_SPLITTER.split(namesProperty)) {
            builder.property(property, table.getProperty(property));
        }
    }

    if (table.getProperty(PARTITION_EXPRESSION_PROPERTY_NAME) != null) {
        builder.partitionStrategy(
            Accessor.getDefault().fromExpression(table.getProperty(PARTITION_EXPRESSION_PROPERTY_NAME)));
    }

    String schemaUrlString = table.getProperty(AVRO_SCHEMA_URL_PROPERTY_NAME);
    if (schemaUrlString != null) {
        try {
            builder.schemaUri(new URI(schemaUrlString));
        } catch (IOException e) {
            throw new MetadataProviderException("Could not read schema", e);
        } catch (URISyntaxException e) {
            // this library sets the URI, so it should always be valid
            throw new MetadataProviderException("[BUG] Invalid schema URI", e);
        }
    }

    String schemaLiteral = table.getProperty(AVRO_SCHEMA_LITERAL_PROPERTY_NAME);
    if (schemaLiteral != null) {
        builder.schemaLiteral(schemaLiteral);
    }

    try {
        return builder.build();
    } catch (IllegalStateException ex) {
        throw new MetadataProviderException("Cannot find schema: missing metadata");
    }
}
From source file:com.cloudera.cdk.data.TestDatasetDescriptor.java
License:Apache License
@Test
public void testSchemaFromHdfs() throws IOException {
    FileSystem fs = getDFS();

    // copy a schema to HDFS
    Path schemaPath = fs.makeQualified(new Path("schema.avsc"));
    FSDataOutputStream out = fs.create(schemaPath);
    IOUtils.copyBytes(USER_SCHEMA_URL.toURL().openStream(), out, fs.getConf());
    out.close();

    // build a schema using the HDFS path and check it's the same
    Schema schema = new DatasetDescriptor.Builder().schemaUri(schemaPath.toUri()).build().getSchema();
    Assert.assertEquals(USER_SCHEMA, schema);
}
From source file:com.cloudera.impala.common.FileSystemUtil.java
License:Apache License
/**
 * Return true iff the path is on the given filesystem.
 */
public static Boolean isPathOnFileSystem(Path path, FileSystem fs) {
    try {
        // Call makeQualified() for the side-effect of FileSystem.checkPath() which will
        // throw an exception if path is not on fs.
        fs.makeQualified(path);
        return true;
    } catch (IllegalArgumentException e) {
        // Path is not on fs.
        return false;
    }
}
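A hypothetical call site for the helper above, sketched under stated assumptions: the URIs, the class name of the sketch, and the rename-versus-copy decision are illustrative and not part of the Impala source.

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.cloudera.impala.common.FileSystemUtil;

public class PathMembershipSketch {
    public static void main(String[] args) throws Exception {
        // Illustrative URIs: one HDFS filesystem, one candidate path on S3.
        FileSystem hdfs = FileSystem.get(URI.create("hdfs://namenode:8020"), new Configuration());
        Path candidate = new Path("s3a://bucket/data/part-00000");
        if (FileSystemUtil.isPathOnFileSystem(candidate, hdfs)) {
            System.out.println("Same filesystem: a rename is enough.");
        } else {
            System.out.println("Different filesystem: fall back to a copy.");
        }
    }
}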
From source file:com.cloudera.kitten.lua.AsapLuaContainerLaunchParameters.java
License:Open Source License
private void addOperatorInputs(Map<String, LocalResource> localResources) throws IOException {
    LOG.info("Inputs: " + operator.getInputFiles());
    FileSystem fs = FileSystem.get(conf);
    for (Entry<String, String> e : operator.getInputFiles().entrySet()) {
        if ((!e.getValue().startsWith("hdfs://")) && (!e.getValue().startsWith("$HDFS"))) {
            LOG.info("adding local resource: " + e);
            String inDir = dir;
            LocalResource rsrc = Records.newRecord(LocalResource.class);
            rsrc.setType(LocalResourceType.FILE);
            rsrc.setVisibility(LocalResourceVisibility.APPLICATION);
            LOG.info("Adding input: " + inDir + "/" + e.getValue());
            Path dst = new Path(inDir + "/" + e.getValue());
            dst = fs.makeQualified(dst);
            FileStatus stat = fs.getFileStatus(dst);
            rsrc.setSize(stat.getLen());
            rsrc.setTimestamp(stat.getModificationTime());
            rsrc.setResource(ConverterUtils.getYarnUrlFromPath(dst));
            localResources.put(e.getKey(), rsrc);
        }
    }
    /*for(String in : operator.getArguments().split(" ")){
        LOG.info("Adding input: "+in);
        LocalResource nl = constructScriptResource();
        localResources.put(in, nl);
    }*/
}
From source file:com.cloudera.kitten.lua.AsapLuaContainerLaunchParameters.java
License:Open Source License
private void configureLocalScriptResourceForPath(LocalResource rsrc, Path path) throws IOException {
    //System.out.println("URI: "+path.toUri());
    FileSystem fs = FileSystem.get(conf);
    Path dst = new Path(dir + "/" + path.getName());
    fs.moveFromLocalFile(path, dst);
    dst = fs.makeQualified(dst);
    FileStatus stat = fs.getFileStatus(dst);
    rsrc.setSize(stat.getLen());
    rsrc.setTimestamp(stat.getModificationTime());
    rsrc.setResource(ConverterUtils.getYarnUrlFromPath(dst));
}
From source file:com.cloudera.oryx.lambda.batch.BatchUpdateFunction.java
License:Open Source License
/**
 * @return paths from {@link FileStatus}es into one comma-separated String
 * @see FileInputFormat#addInputPath(org.apache.hadoop.mapreduce.Job, Path)
 */
private static String joinFSPaths(FileSystem fs, FileStatus[] statuses) {
    StringBuilder joined = new StringBuilder();
    for (FileStatus status : statuses) {
        if (joined.length() > 0) {
            joined.append(',');
        }
        Path path = fs.makeQualified(status.getPath());
        joined.append(StringUtils.escapeString(path.toString()));
    }
    return joined.toString();
}
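One hypothetical way to consume the joined string, sketched as a caller inside the same class (joinFSPaths is private). The directory name, the helper method name, and the use of the standard MapReduce input-directory property are assumptions for illustration, not Oryx code.

// Hypothetical caller: qualify and escape the part files of a directory, then hand
// the comma-separated result to a Hadoop Configuration as the input-directory list.
private static void configureInputDirs(Configuration conf) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] statuses = fs.listStatus(new Path("/data/input")); // illustrative directory
    // "mapreduce.input.fileinputformat.inputdir" is the standard MapReduce input property.
    conf.set("mapreduce.input.fileinputformat.inputdir", joinFSPaths(fs, statuses));
}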
From source file:com.cloudera.oryx.ml.MLUpdate.java
License:Open Source License
@Override
public void runUpdate(JavaSparkContext sparkContext, long timestamp,
        JavaPairRDD<Object, M> newKeyMessageData, JavaPairRDD<Object, M> pastKeyMessageData,
        String modelDirString, TopicProducer<String, String> modelUpdateTopic)
        throws IOException, InterruptedException {

    Objects.requireNonNull(newKeyMessageData);

    JavaRDD<M> newData = newKeyMessageData.values();
    JavaRDD<M> pastData = pastKeyMessageData == null ? null : pastKeyMessageData.values();

    if (newData != null) {
        newData.cache();
        // This forces caching of the RDD. This shouldn't be necessary but we see some freezes
        // when many workers try to materialize the RDDs at once. Hence the workaround.
        newData.foreachPartition(p -> {
        });
    }
    if (pastData != null) {
        pastData.cache();
        pastData.foreachPartition(p -> {
        });
    }

    List<HyperParamValues<?>> hyperParamValues = getHyperParameterValues();
    int valuesPerHyperParam = HyperParams.chooseValuesPerHyperParam(hyperParamValues.size(), candidates);
    List<List<?>> hyperParameterCombos = HyperParams.chooseHyperParameterCombos(hyperParamValues, candidates,
        valuesPerHyperParam);

    Path modelDir = new Path(modelDirString);
    Path tempModelPath = new Path(modelDir, ".temporary");
    Path candidatesPath = new Path(tempModelPath, Long.toString(System.currentTimeMillis()));

    FileSystem fs = FileSystem.get(modelDir.toUri(), sparkContext.hadoopConfiguration());
    fs.mkdirs(candidatesPath);

    Path bestCandidatePath = findBestCandidatePath(sparkContext, newData, pastData, hyperParameterCombos,
        candidatesPath);

    Path finalPath = new Path(modelDir, Long.toString(System.currentTimeMillis()));
    if (bestCandidatePath == null) {
        log.info("Unable to build any model");
    } else {
        // Move best model into place
        fs.rename(bestCandidatePath, finalPath);
    }
    // Then delete everything else
    fs.delete(candidatesPath, true);

    if (modelUpdateTopic == null) {
        log.info("No update topic configured, not publishing models to a topic");
    } else {
        // Push PMML model onto update topic, if it exists
        Path bestModelPath = new Path(finalPath, MODEL_FILE_NAME);
        if (fs.exists(bestModelPath)) {
            FileStatus bestModelPathFS = fs.getFileStatus(bestModelPath);
            PMML bestModel = null;
            boolean modelNeededForUpdates = canPublishAdditionalModelData();
            boolean modelNotTooLarge = bestModelPathFS.getLen() <= maxMessageSize;
            if (modelNeededForUpdates || modelNotTooLarge) {
                // Either the model is required for publishAdditionalModelData, or required because
                // it's going to be serialized to Kafka
                try (InputStream in = fs.open(bestModelPath)) {
                    bestModel = PMMLUtils.read(in);
                }
            }

            if (modelNotTooLarge) {
                modelUpdateTopic.send("MODEL", PMMLUtils.toString(bestModel));
            } else {
                modelUpdateTopic.send("MODEL-REF", fs.makeQualified(bestModelPath).toString());
            }

            if (modelNeededForUpdates) {
                publishAdditionalModelData(sparkContext, bestModel, newData, pastData, finalPath,
                    modelUpdateTopic);
            }
        }
    }

    if (newData != null) {
        newData.unpersist();
    }
    if (pastData != null) {
        pastData.unpersist();
    }
}
From source file:com.cloudera.recordbreaker.analyzer.FSCrawler.java
License:Open Source License
/**
 * <code>getStartNonblockingCrawl</code> traverses a given filesystem. It returns immediately
 * and does not wait for the crawl to complete.
 * If the crawl is created or is already ongoing, it returns true.
 * If the crawl is not currently going and cannot start, it returns false.
 */
public synchronized boolean getStartNonblockingCrawl(final URI fsURI) {
    try {
        final int subdirDepth = INFINITE_CRAWL_DEPTH;
        long fsId = analyzer.getCreateFilesystem(fsURI, true);
        if (fsId < 0) {
            return false;
        }
        LOG.info("Grabbing filesystem: " + fsURI);
        final FileSystem fs = FileSystem.get(fsURI, new Configuration());
        final Path startDir = fs.makeQualified(new Path(fsURI.getPath()));

        final long crawlid = analyzer.getCreatePendingCrawl(fsId, true);
        Thread pendingThread = pendingCrawls.get(crawlid);
        if (pendingThread == null) {
            Thread t = new Thread() {
                public void run() {
                    try {
                        synchronized (pendingCrawls) {
                            pendingCrawls.put(crawlid, this);
                        }
                        synchronized (crawlStatusInfo) {
                            crawlStatusInfo.put(crawlid, new CrawlRuntimeStatus("Initializing crawl"));
                        }

                        // Build the file and dir-level todo lists
                        List<Path> todoFileList = new ArrayList<Path>();
                        List<Path> todoDirList = new ArrayList<Path>();
                        recursiveCrawlBuildList(fs, startDir, subdirDepth, crawlid, todoFileList, todoDirList);

                        // Get the files to process
                        TreeSet<String> observedFilenames = new TreeSet<String>();
                        for (Path p : analyzer.getFilesForCrawl(crawlid)) {
                            observedFilenames.add(p.toString());
                        }
                        for (Iterator<Path> it = todoFileList.iterator(); it.hasNext();) {
                            Path p = it.next();
                            if (observedFilenames.contains(p.toString())) {
                                it.remove();
                            }
                        }

                        // Get the dirs to process
                        TreeSet<String> observedDirnames = new TreeSet<String>();
                        for (Path p : analyzer.getDirsForCrawl(crawlid)) {
                            observedDirnames.add(p.toString());
                        }
                        for (Iterator<Path> it = todoDirList.iterator(); it.hasNext();) {
                            Path p = it.next();
                            if (observedDirnames.contains(p.toString())) {
                                it.remove();
                            }
                        }

                        synchronized (crawlStatusInfo) {
                            CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
                            cstatus.setMessage("Processing files");
                            cstatus.setNumToProcess(todoFileList.size());
                            cstatus.setNumDone(0);
                        }

                        int numDone = 0;
                        for (Path p : todoDirList) {
                            try {
                                analyzer.addSingleFile(fs, p, crawlid);
                            } catch (IOException iex) {
                                iex.printStackTrace();
                            }
                        }
                        for (Path p : todoFileList) {
                            synchronized (crawlStatusInfo) {
                                CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
                                cstatus.setMessage("Processing file " + p.toString());
                            }
                            try {
                                analyzer.addSingleFile(fs, p, crawlid);
                            } catch (Exception iex) {
                                iex.printStackTrace();
                            }
                            numDone++;
                            synchronized (crawlStatusInfo) {
                                CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
                                cstatus.setNumDone(numDone);
                                if (cstatus.shouldFinish()) {
                                    break;
                                }
                            }
                        }
                    } catch (IOException iex) {
                        iex.printStackTrace();
                    } finally {
                        try {
                            synchronized (pendingCrawls) {
                                pendingCrawls.remove(crawlid);
                                analyzer.completeCrawl(crawlid);
                            }
                        } catch (SQLiteException sle) {
                        }
                    }
                }
            };
            t.start();
        }
        return true;
    } catch (Exception iex) {
        iex.printStackTrace();
    }
    return false;
}
From source file:com.conductor.hadoop.WritableValueInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(final JobContext context) throws IOException, InterruptedException {
    final Configuration conf = context.getConfiguration();

    // init the reader
    final String filePath = conf.get(INPUT_FILE_LOCATION_CONF);
    checkArgument(!Strings.isNullOrEmpty(filePath), "Missing property: " + INPUT_FILE_LOCATION_CONF);
    final FileSystem fs = getFileSystem(conf);
    final Path path = fs.makeQualified(new Path(filePath));
    final SequenceFile.Reader reader = getReader(conf, path);

    // create the splits by looping through the values of the input file
    int totalInputs = 0;
    int maxInputsPerSplit = conf.getInt(INPUTS_PER_SPLIT_CONF, DEFAULT_INPUTS_PER_SPLIT);
    long pos = 0L;
    long last = 0L;
    long lengthRemaining = fs.getFileStatus(path).getLen();
    final List<InputSplit> splits = Lists.newArrayList();
    final V value = getV(conf);
    for (final NullWritable key = NullWritable.get(); reader.next(key, value); last = reader.getPosition()) {
        if (++totalInputs % maxInputsPerSplit == 0) {
            long splitSize = last - pos;
            splits.add(new FileSplit(path, pos, splitSize, null));
            lengthRemaining -= splitSize;
            pos = last;
        }
    }
    // create the last split if there is data remaining
    if (lengthRemaining != 0) {
        splits.add(new FileSplit(path, pos, lengthRemaining, null));
    }
    return splits;
}