com.datatorrent.flume.source.HdfsTestSource.java Source code

Java tutorial

Introduction

Below is the complete source code for the `com.datatorrent.flume.source.HdfsTestSource` class.

Source

/**
 * Copyright (c) 2016 DataTorrent, Inc. ALL Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package com.datatorrent.flume.source;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.Timer;
import java.util.TimerTask;

import javax.annotation.Nonnull;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDrivenSource;
import org.apache.flume.channel.ChannelProcessor;
import org.apache.flume.conf.Configurable;
import org.apache.flume.event.EventBuilder;
import org.apache.flume.source.AbstractSource;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;

/**
 * <p>TestSource class.</p>
 *
 * @since 0.9.4
 */
public class HdfsTestSource extends AbstractSource implements EventDrivenSource, Configurable {
    public static final String SOURCE_DIR = "sourceDir";
    public static final String RATE = "rate";
    public static final String INIT_DATE = "initDate";

    static byte FIELD_SEPARATOR = 2;
    public Timer emitTimer;
    @Nonnull
    String directory;
    Path directoryPath;
    int rate;
    String initDate;
    long initTime;
    List<String> dataFiles;
    long oneDayBack;

    private transient BufferedReader br = null;
    protected transient FileSystem fs;
    private transient Configuration configuration;

    private transient int currentFile = 0;
    private transient boolean finished;
    private List<Event> events;

    public HdfsTestSource() {
        super();
        this.rate = 2500;
        dataFiles = Lists.newArrayList();
        Calendar calendar = Calendar.getInstance();
        calendar.add(Calendar.DATE, -1);
        oneDayBack = calendar.getTimeInMillis();
        configuration = new Configuration();
        events = Lists.newArrayList();
    }

    @Override
    public void configure(Context context) {
        directory = context.getString(SOURCE_DIR);
        rate = context.getInteger(RATE, rate);
        initDate = context.getString(INIT_DATE);

        Preconditions.checkArgument(!Strings.isNullOrEmpty(directory));
        directoryPath = new Path(directory);

        String[] parts = initDate.split("-");
        Preconditions.checkArgument(parts.length == 3);
        Calendar calendar = Calendar.getInstance();
        calendar.set(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]) - 1, Integer.parseInt(parts[2]), 0, 0,
                0);
        initTime = calendar.getTimeInMillis();

        try {
            List<String> files = findFiles();
            for (String file : files) {
                dataFiles.add(file);
            }
            if (logger.isDebugEnabled()) {
                SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
                logger.debug("settings {} {} {} {} {}", directory, rate, dateFormat.format(oneDayBack),
                        dateFormat.format(new Date(initTime)), currentFile);
                for (String file : dataFiles) {
                    logger.debug("settings add file {}", file);
                }
            }

            fs = FileSystem.newInstance(new Path(directory).toUri(), configuration);
            Path filePath = new Path(dataFiles.get(currentFile));
            br = new BufferedReader(new InputStreamReader(new GzipCompressorInputStream(fs.open(filePath))));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }

        finished = true;

    }

    private List<String> findFiles() throws IOException {
        List<String> files = Lists.newArrayList();
        Path directoryPath = new Path(directory);
        FileSystem lfs = FileSystem.newInstance(directoryPath.toUri(), configuration);
        try {
            logger.debug("checking for new files in {}", directoryPath);
            RemoteIterator<LocatedFileStatus> statuses = lfs.listFiles(directoryPath, true);
            for (; statuses.hasNext();) {
                FileStatus status = statuses.next();
                Path path = status.getPath();
                String filePathStr = path.toString();
                if (!filePathStr.endsWith(".gz")) {
                    continue;
                }
                logger.debug("new file {}", filePathStr);
                files.add(path.toString());
            }
        } catch (FileNotFoundException e) {
            logger.warn("Failed to list directory {}", directoryPath, e);
            throw new RuntimeException(e);
        } finally {
            lfs.close();
        }
        return files;
    }

    @Override
    public void start() {
        super.start();
        emitTimer = new Timer();

        final ChannelProcessor channelProcessor = getChannelProcessor();
        emitTimer.scheduleAtFixedRate(new TimerTask() {
            @Override
            public void run() {
                int lineCount = 0;
                events.clear();
                try {
                    while (lineCount < rate && !finished) {
                        String line = br.readLine();

                        if (line == null) {
                            logger.debug("completed file {}", currentFile);
                            br.close();
                            currentFile++;
                            if (currentFile == dataFiles.size()) {
                                logger.info("finished all files");
                                finished = true;
                                break;
                            }
                            Path filePath = new Path(dataFiles.get(currentFile));
                            br = new BufferedReader(
                                    new InputStreamReader(new GzipCompressorInputStream(fs.open(filePath))));
                            logger.info("opening file {}. {}", currentFile, filePath);
                            continue;
                        }
                        lineCount++;
                        Event flumeEvent = EventBuilder.withBody(line.getBytes());
                        events.add(flumeEvent);
                    }
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
                if (events.size() > 0) {
                    channelProcessor.processEventBatch(events);
                }
                if (finished) {
                    emitTimer.cancel();
                }
            }

        }, 0, 1000);
    }

    @Override
    public void stop() {
        emitTimer.cancel();
        super.stop();
    }

    private static final Logger logger = LoggerFactory.getLogger(HdfsTestSource.class);
}