com.norconex.collector.http.data.store.impl.jdbc.JDBCCrawlDataSerializer.java Source code

Java tutorial

Introduction

Here is the source code for com.norconex.collector.http.data.store.impl.jdbc.JDBCCrawlDataSerializer.java

Source

/* Copyright 2010-2014 Norconex Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.norconex.collector.http.data.store.impl.jdbc;

import java.math.BigDecimal;
import java.sql.ResultSet;
import java.sql.SQLException;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;

import com.norconex.collector.core.data.ICrawlData;
import com.norconex.collector.core.data.store.impl.jdbc.JDBCCrawlDataStore;
import com.norconex.collector.core.data.store.impl.jdbc.IJDBCSerializer;
import com.norconex.collector.http.data.HttpCrawlData;
import com.norconex.collector.http.data.HttpCrawlState;

public class JDBCCrawlDataSerializer implements IJDBCSerializer {

    private static final int TAG_MAX_LENGTH = 1024;
    private static final int TEXT_MAX_LENGTH = 2048;
    private static final int TITLE_MAX_LENGTH = 2048;

    protected static final String ALL_FIELDS =
            // common attributes:
            "reference, " + "parentRootReference, " + "isRootParentReference, " + "state, " + "metaChecksum, "
                    + "contentChecksum, "
                    // http-specific:
                    + "depth, " + "sitemapLastMod, " + "sitemapChangeFreq, " + "sitemapPriority, "
                    + "referrerReference, " + "referrerLinkTag, " + "referrerLinkText, " + "referrerLinkTitle ";

    @Override
    public String[] getCreateTableSQLs(String table) {
        String sql = "CREATE TABLE " + table + " ("
        // common attributes:
                + "reference VARCHAR(32672) NOT NULL, " + "parentRootReference VARCHAR(32672), "
                + "isRootParentReference BOOLEAN, " + "state VARCHAR(256), " + "metaChecksum VARCHAR(32672), "
                + "contentChecksum VARCHAR(32672), "
                // http-specific:
                + "depth INTEGER NOT NULL, " + "sitemapLastMod BIGINT, " + "sitemapChangeFreq VARCHAR(7), "
                + "sitemapPriority FLOAT, " + "referrerReference VARCHAR(32672), "
                + "referrerLinkTag VARCHAR(1024), " + "referrerLinkText VARCHAR(2048), "
                + "referrerLinkTitle VARCHAR(2048), " + "PRIMARY KEY (reference))";

        String[] sqls = new String[] { sql };
        if (JDBCCrawlDataStore.TABLE_QUEUE.equals(table)) {
            sqls = ArrayUtils.add(sqls, "CREATE INDEX orderindex ON queue(depth)");
        }
        return sqls;
    }

    @Override
    public String getSelectCrawlDataSQL(String table) {
        return "SELECT " + ALL_FIELDS + "FROM " + table;
    }

    @Override
    public String getDeleteCrawlDataSQL(String table) {
        return "DELETE FROM " + table + " WHERE reference = ?";
    }

    public Object[] getDeleteCrawlDataValues(String table, ICrawlData crawlURL) {
        return new Object[] { crawlURL.getReference() };
    }

    @Override
    public String getInsertCrawlDataSQL(String table) {
        return "INSERT INTO " + table + "(" + ALL_FIELDS + ") values (?,?,?,?,?,?,?,?,?,?,?,?,?,?)";
    }

    @Override
    public Object[] getInsertCrawlDataValues(String table, ICrawlData crawlData) {
        HttpCrawlData data = (HttpCrawlData) crawlData;
        return new Object[] { data.getReference(), data.getParentRootReference(), data.isRootParentReference(),
                data.getState().toString(), data.getMetaChecksum(), data.getContentChecksum(), data.getDepth(),
                data.getSitemapLastMod(), data.getSitemapChangeFreq(), data.getSitemapPriority(),
                data.getReferrerReference(), StringUtils.substring(data.getReferrerLinkTag(), 0, TAG_MAX_LENGTH),
                StringUtils.substring(data.getReferrerLinkText(), 0, TEXT_MAX_LENGTH),
                StringUtils.substring(data.getReferrerLinkTitle(), 0, TITLE_MAX_LENGTH), };
    }

    @Override
    public String getNextQueuedCrawlDataSQL() {
        return "SELECT " + ALL_FIELDS + "FROM " + JDBCCrawlDataStore.TABLE_QUEUE + " ORDER BY depth";
    }

    @Override
    public Object[] getNextQueuedCrawlDataValues() {
        return null;
    }

    @Override
    public String getCachedCrawlDataSQL() {
        return "SELECT " + ALL_FIELDS + "FROM " + JDBCCrawlDataStore.TABLE_CACHE + " WHERE reference = ? ";
    }

    @Override
    public Object[] getCachedCrawlDataValues(String reference) {
        return new Object[] { reference };
    }

    @Override
    public String getReferenceExistsSQL(String table) {
        return "SELECT 1 FROM " + table + " WHERE reference = ?";
    }

    @Override
    public Object[] getReferenceExistsValues(String table, String reference) {
        return new Object[] { reference };
    }

    @Override
    public ICrawlData toCrawlData(String table, ResultSet rs) throws SQLException {
        if (rs == null) {
            return null;
        }
        HttpCrawlData data = new HttpCrawlData();
        data.setReference(rs.getString("reference"));
        data.setParentRootReference(rs.getString("parentRootReference"));
        data.setRootParentReference(rs.getBoolean("isRootParentReference"));
        data.setState(HttpCrawlState.valueOf(rs.getString("state")));
        data.setMetaChecksum(rs.getString("metaChecksum"));
        data.setDocumentChecksum(rs.getString("contentChecksum"));
        data.setDepth(rs.getInt("depth"));
        BigDecimal bigLM = rs.getBigDecimal("sitemapLastMod");
        if (bigLM != null) {
            data.setSitemapLastMod(bigLM.longValue());
        }
        BigDecimal bigP = rs.getBigDecimal("sitemapPriority");
        if (bigP != null) {
            data.setSitemapPriority(bigP.floatValue());
        }
        data.setSitemapChangeFreq(rs.getString("sitemapChangeFreq"));
        data.setReferrerReference(rs.getString("referrerReference"));
        data.setReferrerLinkTag(rs.getString("referrerLinkTag"));
        data.setReferrerLinkText(rs.getString("referrerLinkText"));
        data.setReferrerLinkTitle(rs.getString("referrerLinkTitle"));

        return data;
    }
}