com.jaeksoft.searchlib.crawler.cache.HadoopCrawlCache.java Source code

Java tutorial

Introduction

Here is the source code for com.jaeksoft.searchlib.crawler.cache.HadoopCrawlCache.java

Source

/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2012-2014 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.crawler.cache;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.poi.util.IOUtils;
import org.json.JSONException;
import org.json.JSONObject;

import com.jaeksoft.searchlib.crawler.web.spider.DownloadItem;
import com.jaeksoft.searchlib.util.ReadWriteLock;

/**
 * Crawl cache provider which keeps downloaded items on a Hadoop
 * {@link FileSystem}.
 * <p>
 * For every cached URI two files are written below
 * {@code /opensearchserver/http-download-cache}: one built with the
 * {@code "meta"} extension, holding the meta data of the download serialized as
 * a JSON string, and one built with the {@code "content"} extension, holding
 * the downloaded bytes. The exact file layout is computed by the inherited
 * {@code uriToPath} helper.
 * <p>
 * Locking: {@link #init(String)} and {@link #close()} take the write lock of
 * the internal {@link ReadWriteLock}; every cache operation takes the read
 * lock.
 */
public class HadoopCrawlCache extends CrawlCacheProvider {

    /** Root directory of the cache on the Hadoop file system. */
    private final static String PATH_HTTP_DOWNLOAD_CACHE = Path.SEPARATOR + "opensearchserver" + Path.SEPARATOR
            + "http-download-cache";

    /** Extension handed to {@code uriToPath} for the meta data file. */
    private final static String META_EXTENSION = "meta";

    /** Extension handed to {@code uriToPath} for the content file. */
    private final static String CONTENT_EXTENSION = "content";

    /**
     * Names of the Hadoop configuration files which are looked up in the
     * configuration folder given to {@link #init(String)}.
     */
    private final static String[] CONFIG_FILES = { "core-default.xml", "core-site.xml" };

    private final ReadWriteLock rwl = new ReadWriteLock();

    /** The file system in use, or {@code null} when not initialized/closed. */
    private FileSystem fileSystem;

    /** The configuration built by the last call to {@link #init(String)}. */
    private Configuration configuration;

    /**
     * Creates a non configured cache. {@link #init(String)} must be called
     * before any cache operation.
     */
    public HadoopCrawlCache() {
        super(CrawlCacheProviderEnum.HADOOP);
        configuration = null;
        fileSystem = null;
    }

    /**
     * (Re)configures the cache. Any previously opened file system is closed
     * first.
     * 
     * @param configString
     *            path of the folder containing the Hadoop configuration files
     * @throws IOException
     *             if the file system cannot be obtained
     */
    @Override
    public void init(String configString) throws IOException {
        rwl.w.lock();
        try {
            closeNoLock();
            configuration = new Configuration();
            for (String configFile : CONFIG_FILES)
                configuration.addResource(new Path(configString, configFile));
            // FileSystem.get() hands out an instance shared through Hadoop's
            // internal cache. As this class closes its file system, it must
            // own a private instance, otherwise closing would also break any
            // other user of the shared one.
            fileSystem = FileSystem.newInstance(configuration);
        } finally {
            rwl.w.unlock();
        }
    }

    /**
     * Closes the file system (errors are ignored) and forgets it. The caller
     * must hold the write lock.
     */
    private void closeNoLock() {
        if (fileSystem != null) {
            IOUtils.closeQuietly(fileSystem);
            fileSystem = null;
        }
    }

    /**
     * Releases the file system. The configuration is kept, so
     * {@link #getInfos()} still reports it afterwards.
     */
    @Override
    public void close() {
        rwl.w.lock();
        try {
            closeNoLock();
        } finally {
            rwl.w.unlock();
        }
    }

    /**
     * @return the string form of the current Hadoop configuration, or
     *         {@code null} if {@link #init(String)} has never been called
     */
    @Override
    public String getInfos() throws IOException {
        rwl.r.lock();
        try {
            if (configuration == null)
                return null;
            return configuration.toString();
        } finally {
            rwl.r.unlock();
        }
    }

    /**
     * Builds the path of a cache file for the given URI.
     * 
     * @param uri
     *            the cached URI
     * @param extension
     *            {@link #META_EXTENSION} or {@link #CONTENT_EXTENSION}
     * @return the path below the cache root
     */
    private Path uriToPath(URI uri, String extension) throws UnsupportedEncodingException {
        String path = super.uriToPath(uri, PATH_HTTP_DOWNLOAD_CACHE, 10, Path.SEPARATOR, extension, 32);
        return new Path(path);
    }

    /**
     * Writes the meta data and the content of a download item to the cache.
     * The content stream of the item is fully consumed and closed.
     * 
     * @param downloadItem
     *            the item to store
     * @return a new stream reading the content back from the cache; the caller
     *         is responsible for closing it
     * @throws IOException
     *             if the cache is not configured or a file system operation
     *             fails
     */
    @Override
    public InputStream store(DownloadItem downloadItem) throws IOException, JSONException {
        rwl.r.lock();
        try {
            // Same guard as load()/flush(): report a missing configuration as
            // an IOException rather than a NullPointerException.
            checkFileSystemAvailable();
            URI uri = downloadItem.getUri();

            Path path = checkPath(uriToPath(uri, META_EXTENSION));
            write(path, downloadItem.getMetaAsJson());
            path = checkPath(uriToPath(uri, CONTENT_EXTENSION));
            InputStream is = downloadItem.getContentInputStream();
            try {
                write(path, is);
            } finally {
                // Release the source stream even if the copy failed.
                IOUtils.closeQuietly(is);
            }
            return fileSystem.open(path);
        } finally {
            rwl.r.unlock();
        }
    }

    /**
     * Loads a download item from the cache.
     * 
     * @param uri
     *            the URI to look up
     * @param expirationTime
     *            entries whose meta file was modified before this time (in
     *            milliseconds) are ignored; {@code 0} disables the check
     * @return the cached item with its content stream opened (to be closed by
     *         the caller), or {@code null} if there is no usable entry
     * @throws IOException
     *             if the cache is not configured or a file system operation
     *             fails
     */
    @Override
    public DownloadItem load(URI uri, long expirationTime) throws IOException, JSONException, URISyntaxException {
        rwl.r.lock();
        try {
            checkFileSystemAvailable();
            Path metaPath = uriToPath(uri, META_EXTENSION);
            if (!fileSystem.exists(metaPath))
                return null;
            if (expirationTime != 0)
                if (fileSystem.getFileStatus(metaPath).getModificationTime() < expirationTime)
                    return null;
            // store() writes the meta file before the content file, so a meta
            // file may exist without its content (failed or in-progress
            // store). Treat that as a cache miss instead of failing on open().
            Path contentPath = uriToPath(uri, CONTENT_EXTENSION);
            if (!fileSystem.exists(contentPath))
                return null;
            JSONObject json = new JSONObject(read(metaPath));
            DownloadItem downloadItem = new DownloadItem(uri);
            downloadItem.loadMetaFromJson(json);
            downloadItem.setContentInputStream(fileSystem.open(contentPath));
            return downloadItem;
        } finally {
            rwl.r.unlock();
        }
    }

    /**
     * Removes the cache entry of one URI.
     * 
     * @param uri
     *            the URI to remove
     * @return {@code true} if at least one of the two cache files was deleted
     * @throws IOException
     *             if the cache is not configured or a file system operation
     *             fails
     */
    @Override
    public boolean flush(URI uri) throws IOException {
        rwl.r.lock();
        try {
            checkFileSystemAvailable();
            Path path = uriToPath(uri, META_EXTENSION);
            boolean deleted = false;
            if (fileSystem.exists(path))
                deleted = fileSystem.delete(path, false) || deleted;
            path = uriToPath(uri, CONTENT_EXTENSION);
            if (fileSystem.exists(path))
                deleted = fileSystem.delete(path, false) || deleted;
            return deleted;
        } finally {
            rwl.r.unlock();
        }
    }

    /**
     * Recursively deletes the files modified before the expiration time, and
     * the directories left empty.
     * 
     * @param files
     *            the entries of the directory being purged, may be
     *            {@code null}
     * @param expiration
     *            files modified before this time (in milliseconds) are deleted
     * @return the number of deleted files and directories
     */
    private long purge(FileStatus[] files, long expiration) throws IOException {
        // Defensive: tolerate a file system returning no listing at all.
        if (files == null)
            return 0;
        long count = 0;
        for (FileStatus file : files) {
            if (file.isDirectory()) {
                Path p = file.getPath();
                count += purge(fileSystem.listStatus(p), expiration);
                // List again: the recursive call may have emptied the
                // directory.
                FileStatus[] fs = fileSystem.listStatus(p);
                if (fs != null && fs.length == 0)
                    if (fileSystem.delete(p, false))
                        count++;
            } else {
                if (file.getModificationTime() < expiration)
                    if (fileSystem.delete(file.getPath(), false))
                        count++;
            }
        }
        return count;
    }

    /**
     * @throws IOException
     *             if no file system is currently opened
     */
    private void checkFileSystemAvailable() throws IOException {
        if (fileSystem == null)
            throw new IOException("File system not configured");
    }

    /**
     * Removes every cache file modified before the given time.
     * 
     * @param expiration
     *            files modified before this time (in milliseconds) are deleted
     * @return the number of deleted files and directories, {@code 0} when the
     *         cache directory does not exist yet
     * @throws IOException
     *             if the cache is not configured or a file system operation
     *             fails
     */
    @Override
    public long flush(long expiration) throws IOException {
        rwl.r.lock();
        try {
            checkFileSystemAvailable();
            Path path = new Path(PATH_HTTP_DOWNLOAD_CACHE);
            // Nothing has been stored yet: nothing to purge. Without this
            // check, listing the missing directory would fail.
            if (!fileSystem.exists(path))
                return 0;
            return purge(fileSystem.listStatus(path), expiration);
        } finally {
            rwl.r.unlock();
        }
    }

    /**
     * Reads back a string written by {@link #write(Path, String)}.
     */
    private String read(Path path) throws IOException {
        FSDataInputStream in = fileSystem.open(path);
        try {
            return in.readUTF();
        } finally {
            IOUtils.closeQuietly(in);
        }
    }

    /**
     * Makes sure the parent directory of the given file exists.
     * 
     * @param path
     *            the file about to be written
     * @return the given path, for call chaining
     * @throws IOException
     *             if the parent directory cannot be created
     */
    private Path checkPath(Path path) throws IOException {
        if (!fileSystem.exists(path)) {
            Path parent = path.getParent();
            if (parent != null && !fileSystem.exists(parent))
                if (!fileSystem.mkdirs(parent))
                    throw new IOException("Unable to create the directory: " + parent);
        }
        return path;
    }

    /**
     * Writes a string to a file (overwriting it) using
     * {@link java.io.DataOutputStream#writeUTF(String)}.
     * <p>
     * NOTE: writeUTF cannot encode a string longer than 65535 bytes; the
     * format is kept as it is to stay compatible with {@link #read(Path)} and
     * with entries already present in the cache.
     */
    private void write(Path path, String content) throws IOException {
        FSDataOutputStream out = fileSystem.create(path, true);
        boolean closed = false;
        try {
            out.writeUTF(content);
            // Close explicitly: a failure while closing means the data may not
            // have been persisted, and must be reported to the caller.
            out.close();
            closed = true;
        } finally {
            if (!closed)
                IOUtils.closeQuietly(out);
        }
    }

    /**
     * Copies a stream to a file (overwriting it). The input stream is not
     * closed by this method.
     */
    private void write(Path path, InputStream in) throws IOException {
        FSDataOutputStream out = fileSystem.create(path, true);
        boolean closed = false;
        try {
            IOUtils.copy(in, out);
            // Close explicitly: a failure while closing means the data may not
            // have been persisted, and must be reported to the caller.
            out.close();
            closed = true;
        } finally {
            if (!closed)
                IOUtils.closeQuietly(out);
        }
    }

    /**
     * @return the help text describing what {@link #init(String)} expects
     */
    @Override
    public String getConfigurationInformation() {
        return "Please provide the path to the Hadoop configuration (etc) folder";
    }

}