eu.matejkormuth.crawler2.Document.java Source code

Introduction

Here is the source code for eu.matejkormuth.crawler2.Document.java
Source

/*
 *  crawler2 - crawler for java
 *  Copyright (C) 2015 Matej Kormuth 
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *  
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package eu.matejkormuth.crawler2;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;

import org.apache.commons.io.FilenameUtils;
import org.apache.http.entity.ContentType;

import eu.matejkormuth.crawler2.documents.BinaryDocument;
import eu.matejkormuth.crawler2.documents.HtmlDocument;

/**
 * Represents page / file that has been fetched from server.
 * <p>
 * Instances are created through {@link #create(String, String, long, InputStream)},
 * which picks the concrete subclass based on the content type. The {@link #url}
 * field is not assigned by this class; it is expected to be set by code that has
 * access to it (subclasses or classes of this package).
 */
public abstract class Document {

    /** URL this document has been fetched from; not assigned by this class. */
    protected URL url;
    /** Charset resolved from the encoding name passed to the constructor. */
    protected Charset encoding;

    /**
     * Creates document with specified encoding.
     * 
     * @param contentEncoding
     *            name of the charset of this document, resolved with
     *            {@link Charset#forName(String)}
     * @throws IllegalArgumentException
     *             if the name is <code>null</code>, is not a legal charset
     *             name or the charset is not supported by this JVM
     */
    protected Document(String contentEncoding) {
        this.encoding = Charset.forName(contentEncoding);
    }

    /**
     * Creates document of type matching the specified content type:
     * {@link HtmlDocument} for <code>text/html</code> and
     * {@link BinaryDocument} for everything else (including
     * <code>null</code> content type).
     * <p>
     * Parameters of the content type (for example
     * <code>text/html; charset=UTF-8</code>) are ignored when deciding the
     * type of the document, but the original value is passed unchanged to the
     * created document.
     * 
     * @param contentType
     *            content type of the document, with or without parameters
     * @param contentEncoding
     *            name of the charset of the document
     * @param contentLength
     *            length of the content
     * @param content
     *            stream of the content of the document
     * @return new document
     */
    static Document create(String contentType, String contentEncoding, long contentLength, InputStream content) {
        // equalsIgnoreCase(null) is false, so missing content type ends up as binary document.
        if (ContentType.TEXT_HTML.getMimeType().equalsIgnoreCase(stripParameters(contentType))) {
            return createHtmlDocument(contentType, contentEncoding, contentLength, content);
        } else {
            return createBinaryDocument(contentType, contentEncoding, contentLength, content);
        }
    }

    /**
     * Returns only the mime type part of the content type, that is everything
     * before first <code>;</code> with surrounding whitespace removed.
     * 
     * @param contentType
     *            content type, possibly with parameters, may be
     *            <code>null</code>
     * @return mime type or <code>null</code> if content type is
     *         <code>null</code>
     */
    private static String stripParameters(String contentType) {
        if (contentType == null) {
            return null;
        }
        int separator = contentType.indexOf(';');
        String mimeType = separator < 0 ? contentType : contentType.substring(0, separator);
        return mimeType.trim();
    }

    private static Document createBinaryDocument(String contentType, String contentEncoding, long contentLength,
            InputStream content) {
        return new BinaryDocument(contentType, contentEncoding, contentLength, content);
    }

    private static Document createHtmlDocument(String contentType, String contentEncoding, long contentLength,
            InputStream content) {
        return new HtmlDocument(contentType, contentEncoding, contentLength, content);
    }

    /**
     * Returns URL that this document has been fetched from.
     * 
     * @return url of this document
     */
    public URL getUrl() {
        return url;
    }

    /**
     * Returns byte array representation of this document content.
     * 
     * @return content of this document
     */
    public abstract byte[] getContent();

    /**
     * Returns content type of this document.
     * 
     * @return content type of this document
     */
    public abstract ContentType getContentType();

    /**
     * Returns Charset / encoding used in this document.
     * 
     * @return document's encoding / charset
     */
    public Charset getEncoding() {
        return this.encoding;
    }

    /**
     * Saves this document to specified path.
     * 
     * @param path
     *            path to save document at
     * @throws RuntimeException
     *             if the content can't be written, the original
     *             {@link IOException} is set as cause
     */
    public void saveTo(Path path) {
        try {
            Files.write(path, this.getContent());
        } catch (IOException e) {
            throw new RuntimeException("Can't save document to " + path, e);
        }
    }

    /**
     * Returns the name of this document, that is the last segment of the path
     * of its URL. Query string and fragment of the URL are not part of the
     * name. Beware that this can return empty string for index documents with
     * URL like <code>example.com/page/</code>.
     * 
     * @return name of this file
     * @throws NullPointerException
     *             if URL of this document is not set
     */
    public String getName() {
        // Use only the path, otherwise "page.html?id=1" or the host name would leak into the name.
        return FilenameUtils.getName(this.url.getPath());
    }

    /**
     * Returns the path of this document on server. This has same effect as
     * calling <code>getUrl().getPath()</code>.
     * 
     * @return path of this document on server
     * @throws NullPointerException
     *             if URL of this document is not set
     */
    public String getPath() {
        return this.url.getPath();
    }

    /**
     * Returns extension of this document, determined from the path of its URL.
     * Query string and fragment of the URL are not part of the extension.
     * Beware that this can return empty string for index documents with URL
     * like <code>example.com/page/</code>.
     * 
     * @return extension of this document (file)
     * @throws NullPointerException
     *             if URL of this document is not set
     */
    public String getExtension() {
        // Use only the path, otherwise "html?id=1" or the TLD of the host would be returned.
        return FilenameUtils.getExtension(this.url.getPath());
    }

}