Java tutorial: reading HDFS data with Apache AsterixDB's HDFSDataSourceFactory
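The listing below is HDFSDataSourceFactory from Apache AsterixDB's external-data layer. It is the factory the runtime uses to read external data stored in HDFS: configure(...) computes the input splits and a read schedule once, and createRecordReader(...) then builds one reader per partition, either passing Hadoop Writable objects straight through to the parser or converting the input into a byte stream, with optional external-file indexing layered on top. Two short, self-contained sketches follow the listing.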
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.asterix.external.input;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;

import org.apache.asterix.common.exceptions.AsterixException;
import org.apache.asterix.external.api.AsterixInputStream;
import org.apache.asterix.external.api.IExternalIndexer;
import org.apache.asterix.external.api.IIndexibleExternalDataSource;
import org.apache.asterix.external.api.IRecordReader;
import org.apache.asterix.external.api.IRecordReaderFactory;
import org.apache.asterix.external.indexing.ExternalFile;
import org.apache.asterix.external.indexing.IndexingScheduler;
import org.apache.asterix.external.input.record.reader.IndexingStreamRecordReader;
import org.apache.asterix.external.input.record.reader.hdfs.HDFSRecordReader;
import org.apache.asterix.external.input.record.reader.stream.StreamRecordReader;
import org.apache.asterix.external.input.stream.HDFSInputStream;
import org.apache.asterix.external.provider.ExternalIndexerProvider;
import org.apache.asterix.external.provider.StreamRecordReaderProvider;
import org.apache.asterix.external.provider.StreamRecordReaderProvider.Format;
import org.apache.asterix.external.util.ExternalDataConstants;
import org.apache.asterix.external.util.ExternalDataUtils;
import org.apache.asterix.external.util.HDFSUtils;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hyracks.algebricks.common.constraints.AlgebricksAbsolutePartitionConstraint;
import org.apache.hyracks.api.context.IHyracksTaskContext;
import org.apache.hyracks.api.exceptions.HyracksDataException;
import org.apache.hyracks.hdfs.dataflow.ConfFactory;
import org.apache.hyracks.hdfs.dataflow.InputSplitsFactory;
import org.apache.hyracks.hdfs.scheduler.Scheduler;

public class HDFSDataSourceFactory implements IRecordReaderFactory<Object>, IIndexibleExternalDataSource {

    protected static final long serialVersionUID = 1L;
    protected transient AlgebricksAbsolutePartitionConstraint clusterLocations;
    protected String[] readSchedule;
    protected boolean[] read;
    protected InputSplitsFactory inputSplitsFactory;
    protected ConfFactory confFactory;
    protected boolean configured = false;
    protected static Scheduler hdfsScheduler;
    protected static IndexingScheduler indexingScheduler;
    protected static Boolean initialized = false;
    protected static Object initLock = new Object();
    protected List<ExternalFile> files;
    protected Map<String, String> configuration;
    protected Class<?> recordClass;
    protected boolean indexingOp = false;
    private JobConf conf;
    private InputSplit[] inputSplits;
    private String nodeName;
    private Format format;

    @Override
    public void configure(Map<String, String> configuration) throws AsterixException {
        try {
            init();
            this.configuration = configuration;
            JobConf conf = HDFSUtils.configureHDFSJobConf(configuration);
            confFactory = new ConfFactory(conf);
            clusterLocations = getPartitionConstraint();
            int numPartitions = clusterLocations.getLocations().length;
            // if the files list was set, we restrict the splits to that list
            InputSplit[] inputSplits;
            if (files == null) {
                inputSplits = conf.getInputFormat().getSplits(conf, numPartitions);
            } else {
                inputSplits = HDFSUtils.getSplits(conf, files);
            }
            if (indexingOp) {
                readSchedule = indexingScheduler.getLocationConstraints(inputSplits);
            } else {
                readSchedule = hdfsScheduler.getLocationConstraints(inputSplits);
            }
            inputSplitsFactory = new InputSplitsFactory(inputSplits);
            read = new boolean[readSchedule.length];
            Arrays.fill(read, false);
            String formatString = configuration.get(ExternalDataConstants.KEY_FORMAT);
            if (formatString == null || formatString.equals(ExternalDataConstants.FORMAT_HDFS_WRITABLE)) {
                RecordReader<?, ?> reader = conf.getInputFormat().getRecordReader(inputSplits[0], conf, Reporter.NULL);
                this.recordClass = reader.createValue().getClass();
                reader.close();
            } else {
                format = StreamRecordReaderProvider.getReaderFormat(configuration);
                this.recordClass = char[].class;
            }
        } catch (IOException e) {
            throw new AsterixException(e);
        }
    }

    // Used to tell the factory to restrict the splits to the intersection between this list and the
    // actual files on HDFS
    @Override
    public void setSnapshot(List<ExternalFile> files, boolean indexingOp) {
        this.files = files;
        this.indexingOp = indexingOp;
    }

    /*
     * The method below was modified to take care of the following:
     * 1. when the target files are not null, it generates a file-aware input stream that validates
     * against the files
     * 2. if the data is binary, it returns a generic reader
     */
    public AsterixInputStream createInputStream(IHyracksTaskContext ctx, int partition, IExternalIndexer indexer)
            throws HyracksDataException {
        try {
            if (!configured) {
                conf = confFactory.getConf();
                inputSplits = inputSplitsFactory.getSplits();
                nodeName = ctx.getJobletContext().getApplicationContext().getNodeId();
                configured = true;
            }
            return new HDFSInputStream(read, inputSplits, readSchedule, nodeName, conf, configuration, files, indexer);
        } catch (Exception e) {
            throw new HyracksDataException(e);
        }
    }

    /**
     * Get the cluster locations for this input stream factory. This method specifies on which asterix nodes the
     * external adapter will run and how many threads per node.
     *
     * @return the partition constraint (nodes and degree of parallelism)
     */
    @Override
    public AlgebricksAbsolutePartitionConstraint getPartitionConstraint() {
        clusterLocations = HDFSUtils.getPartitionConstraints(clusterLocations);
        return clusterLocations;
    }

    /**
     * This method initializes the scheduler, which assigns responsibility for reading the different logical
     * input splits from HDFS.
     */
    private static void init() {
        if (!initialized) {
            synchronized (initLock) {
                if (!initialized) {
                    hdfsScheduler = HDFSUtils.initializeHDFSScheduler();
                    indexingScheduler = HDFSUtils.initializeIndexingHDFSScheduler();
                    initialized = true;
                }
            }
        }
    }

    public JobConf getJobConf() throws HyracksDataException {
        return confFactory.getConf();
    }

    @Override
    public DataSourceType getDataSourceType() {
        return ExternalDataUtils.getDataSourceType(configuration);
    }

    /**
     * The HDFS data source is a special case in two ways:
     * 1. It supports indexing.
     * 2. It returns its input as a set of Writable objects that we sometimes transform internally into a byte
     * stream.
     * Hence, it can produce:
     * 1. StreamRecordReader: when we transform the input into a byte stream.
     * 2. IndexingStreamRecordReader: when we transform the input into a byte stream and perform indexing.
     * 3. HDFSRecordReader: when we simply pass the Writable object as-is to the parser.
     */
    @Override
    public IRecordReader<? extends Object> createRecordReader(IHyracksTaskContext ctx, int partition)
            throws HyracksDataException {
        try {
            IExternalIndexer indexer = files == null ? null : ExternalIndexerProvider.getIndexer(configuration);
            if (format != null) {
                StreamRecordReader streamReader = StreamRecordReaderProvider.createRecordReader(format,
                        createInputStream(ctx, partition, indexer), configuration);
                if (indexer != null) {
                    return new IndexingStreamRecordReader(streamReader, indexer);
                } else {
                    return streamReader;
                }
            }
            JobConf conf = confFactory.getConf();
            InputSplit[] inputSplits = inputSplitsFactory.getSplits();
            String nodeName = ctx.getJobletContext().getApplicationContext().getNodeId();
            return new HDFSRecordReader<Object, Writable>(read, inputSplits, readSchedule, nodeName, conf, files,
                    indexer);
        } catch (Exception e) {
            throw new HyracksDataException(e);
        }
    }

    @Override
    public Class<?> getRecordClass() {
        return recordClass;
    }

    @Override
    public boolean isIndexible() {
        return true;
    }

    @Override
    public boolean isIndexingOp() {
        return (files != null) && indexingOp;
    }
}
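For orientation, this is roughly how a factory of this shape is driven. The interfaces below are simplified, hypothetical stand-ins for the AsterixDB/Hyracks ones (no cluster, no HDFS involved): the point is only the lifecycle that HDFSDataSourceFactory follows, configure once, then one record reader per partition.

import java.util.HashMap;
import java.util.Map;

// Hypothetical, simplified stand-ins for the AsterixDB interfaces, just to show the lifecycle.
interface SimpleRecordReader<T> extends AutoCloseable {
    boolean hasNext() throws Exception;
    T next() throws Exception;
}

interface SimpleRecordReaderFactory<T> {
    void configure(Map<String, String> conf) throws Exception;              // once, up front
    SimpleRecordReader<T> createRecordReader(int partition) throws Exception; // once per partition
}

public class FactoryLifecycleDemo {
    public static void main(String[] args) throws Exception {
        SimpleRecordReaderFactory<String> factory = new InMemoryLinesFactory();

        // 1. configure once with the adapter properties (compare KEY_FORMAT in the listing)
        Map<String, String> conf = new HashMap<>();
        conf.put("format", "delimited-text");
        factory.configure(conf);

        // 2. each partition gets its own reader, as createRecordReader(ctx, partition) does above
        for (int partition = 0; partition < 2; partition++) {
            try (SimpleRecordReader<String> reader = factory.createRecordReader(partition)) {
                while (reader.hasNext()) {
                    System.out.println("partition " + partition + ": " + reader.next());
                }
            }
        }
    }
}

// Toy factory whose "splits" are hard-coded lines instead of HDFS blocks.
class InMemoryLinesFactory implements SimpleRecordReaderFactory<String> {
    private String[][] splits;

    @Override
    public void configure(Map<String, String> conf) {
        // A real implementation would use conf here; the toy just fixes two splits.
        splits = new String[][] { { "a", "b" }, { "c" } };
    }

    @Override
    public SimpleRecordReader<String> createRecordReader(int partition) {
        String[] split = splits[partition];
        return new SimpleRecordReader<String>() {
            private int i = 0;
            @Override public boolean hasNext() { return i < split.length; }
            @Override public String next() { return split[i++]; }
            @Override public void close() { }
        };
    }
}

In the real factory the "splits" are HDFS blocks, the read schedule pins each split to a cluster node, and the partition index corresponds to a Hyracks task.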
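The init() method above is the double-checked locking idiom: an unsynchronized fast-path check, then a synchronized re-check, so the two shared schedulers are created exactly once no matter how many factories call configure() concurrently. Here is a minimal, self-contained sketch of the same pattern; SchedulerStandIn is a hypothetical placeholder, not the Hyracks Scheduler. The sketch declares the flag volatile, which the idiom needs for the unsynchronized first check to be fully safe under the Java Memory Model (the original relies on a plain Boolean field).

// Minimal sketch of the double-checked locking used by HDFSDataSourceFactory.init().
public final class LazySchedulers {

    // Hypothetical stand-in for an expensive-to-build shared scheduler.
    static final class SchedulerStandIn {
    }

    // volatile gives the unsynchronized first check a happens-before edge with the write.
    private static volatile boolean initialized = false;
    private static final Object initLock = new Object();
    private static SchedulerStandIn hdfsScheduler;
    private static SchedulerStandIn indexingScheduler;

    static void init() {
        if (!initialized) {                   // fast path: no lock once initialized
            synchronized (initLock) {
                if (!initialized) {           // re-check: another thread may have won the race
                    hdfsScheduler = new SchedulerStandIn();
                    indexingScheduler = new SchedulerStandIn();
                    initialized = true;       // publish only after both fields are set
                }
            }
        }
    }

    public static void main(String[] args) throws InterruptedException {
        // Hammer init() from several threads; the schedulers are still created exactly once.
        Thread[] threads = new Thread[8];
        for (int i = 0; i < threads.length; i++) {
            threads[i] = new Thread(LazySchedulers::init);
            threads[i].start();
        }
        for (Thread t : threads) {
            t.join();
        }
        System.out.println("initialized = " + initialized);
    }
}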