// TestURLUtil.java :  » Web-Crawler » nutch » org » apache » nutch » util » Java Open Source
//
// Java Open Source » Web Crawler » nutch
// nutch » org » apache » nutch » util » TestURLUtil.java
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.util;

import java.net.URL;

import junit.framework.TestCase;

/**
 * Test class for URLUtil.
 *
 * <p>Exercises {@code URLUtil.getDomainName}, {@code URLUtil.getDomainSuffix},
 * {@code URLUtil.getHostSegments} and {@code URLUtil.chooseRepr} with plain
 * host names, an IPv4 address, an empty host and an internationalized
 * (non-ASCII) host name.
 *
 * <p>NOTE(review): the non-ASCII host label (U+5546 U+696D) was missing from
 * the copy of this file the fix was based on and has been restored from the
 * ".tw" IDN second-level suffix; confirm it is present in the suffix list
 * used by URLUtil. It is written with Unicode escapes so the test does not
 * depend on the encoding the source file is compiled with.
 */
public class TestURLUtil
  extends TestCase {

  @Override
  protected void setUp()
    throws Exception {
    super.setUp();
  }

  /**
   * Checks the domain name returned for hosts with single- and multi-part
   * suffixes, an IP address, a bare suffix, unknown top-level domains, an
   * empty host and a non-ASCII suffix.
   */
  public void testGetDomainName()
    throws Exception {

    URL url;

    url = new URL("http://lucene.apache.org/nutch");
    assertEquals("apache.org", URLUtil.getDomainName(url));

    url = new URL("http://en.wikipedia.org/wiki/Java_coffee");
    assertEquals("wikipedia.org", URLUtil.getDomainName(url));

    // an IP address is expected back unchanged
    url = new URL("http://140.211.11.130/foundation/contributing.html");
    assertEquals("140.211.11.130", URLUtil.getDomainName(url));

    // the port must not leak into the domain name
    url = new URL("http://www.example.co.uk:8080/index.html");
    assertEquals("example.co.uk", URLUtil.getDomainName(url));

    url = new URL("http://com");
    assertEquals("com", URLUtil.getDomainName(url));

    url = new URL("http://www.example.co.uk.com");
    assertEquals("uk.com", URLUtil.getDomainName(url));

    // "nn" is not a tld
    url = new URL("http://example.com.nn");
    assertEquals("nn", URLUtil.getDomainName(url));

    // empty host
    url = new URL("http://");
    assertEquals("", URLUtil.getDomainName(url));

    url = new URL("http://www.edu.tr.xyz");
    assertEquals("xyz", URLUtil.getDomainName(url));

    url = new URL("http://www.example.c.se");
    assertEquals("example.c.se", URLUtil.getDomainName(url));

    // plc.co.im is listed as a domain suffix
    url = new URL("http://www.example.plc.co.im");
    assertEquals("example.plc.co.im", URLUtil.getDomainName(url));

    // 2000.hu is listed as a domain suffix
    url = new URL("http://www.example.2000.hu");
    assertEquals("example.2000.hu", URLUtil.getDomainName(url));

    // test non-ascii: the suffix label is U+5546 U+696D followed by ".tw"
    url = new URL("http://www.example.\u5546\u696d.tw");
    assertEquals("example.\u5546\u696d.tw", URLUtil.getDomainName(url));

  }

  /**
   * Checks the domain suffix returned for the same kinds of hosts; a
   * {@code null} result is expected for an IP address, an empty host and
   * hosts whose last label is not a known suffix.
   */
  public void testGetDomainSuffix()
    throws Exception {
    URL url;

    url = new URL("http://lucene.apache.org/nutch");
    assertEquals("org", URLUtil.getDomainSuffix(url).getDomain());

    url = new URL("http://140.211.11.130/foundation/contributing.html");
    assertNull(URLUtil.getDomainSuffix(url));

    url = new URL("http://www.example.co.uk:8080/index.html");
    assertEquals("co.uk", URLUtil.getDomainSuffix(url).getDomain());

    url = new URL("http://com");
    assertEquals("com", URLUtil.getDomainSuffix(url).getDomain());

    url = new URL("http://www.example.co.uk.com");
    assertEquals("com", URLUtil.getDomainSuffix(url).getDomain());

    // "nn" is not a tld
    url = new URL("http://example.com.nn");
    assertNull(URLUtil.getDomainSuffix(url));

    // empty host
    url = new URL("http://");
    assertNull(URLUtil.getDomainSuffix(url));

    url = new URL("http://www.edu.tr.xyz");
    assertNull(URLUtil.getDomainSuffix(url));

    url = new URL("http://subdomain.example.edu.tr");
    assertEquals("edu.tr", URLUtil.getDomainSuffix(url).getDomain());

    url = new URL("http://subdomain.example.presse.fr");
    assertEquals("presse.fr", URLUtil.getDomainSuffix(url).getDomain());

    // same second-level label under "tr": only "tr" is expected as suffix
    url = new URL("http://subdomain.example.presse.tr");
    assertEquals("tr", URLUtil.getDomainSuffix(url).getDomain());

    // plc.co.im is listed as a domain suffix
    url = new URL("http://www.example.plc.co.im");
    assertEquals("plc.co.im", URLUtil.getDomainSuffix(url).getDomain());

    // 2000.hu is listed as a domain suffix
    url = new URL("http://www.example.2000.hu");
    assertEquals("2000.hu", URLUtil.getDomainSuffix(url).getDomain());

    // test non-ascii: the suffix label is U+5546 U+696D followed by ".tw"
    url = new URL("http://www.example.\u5546\u696d.tw");
    assertEquals("\u5546\u696d.tw", URLUtil.getDomainSuffix(url).getDomain());

  }

  /**
   * Checks how a host is split into segments: dot-separated labels for a
   * regular host, a single empty segment for an empty host, and a single
   * segment holding the whole address for an IP address.
   */
  public void testGetHostSegments()
    throws Exception {
    URL url;
    String[] segments;

    url = new URL("http://subdomain.example.edu.tr");
    segments = URLUtil.getHostSegments(url);
    assertEquals("subdomain", segments[0]);
    assertEquals("example", segments[1]);
    assertEquals("edu", segments[2]);
    assertEquals("tr", segments[3]);

    // empty host
    url = new URL("http://");
    segments = URLUtil.getHostSegments(url);
    assertEquals(1, segments.length);
    assertEquals("", segments[0]);

    // an IP address is expected as one segment, not split on its dots
    url = new URL("http://140.211.11.130/foundation/contributing.html");
    segments = URLUtil.getHostSegments(url);
    assertEquals(1, segments.length);
    assertEquals("140.211.11.130", segments[0]);

    // test non-ascii: the third label is U+5546 U+696D
    url = new URL("http://www.example.\u5546\u696d.tw");
    segments = URLUtil.getHostSegments(url);
    assertEquals("www", segments[0]);
    assertEquals("example", segments[1]);
    assertEquals("\u5546\u696d", segments[2]);
    assertEquals("tw", segments[3]);

  }

  /**
   * Checks which URL of a redirect pair (source, destination) is chosen as
   * the representative one. The third argument of {@code chooseRepr} is
   * {@code true} in the temporary-redirect cases and {@code false} in the
   * permanent ones. In the case comments a {@code *} marks the URL that is
   * expected to be chosen.
   */
  public void testChooseRepr()
    throws Exception {

    String aDotCom = "http://www.a.com";
    String bDotCom = "http://www.b.com";
    String aSubDotCom = "http://www.news.a.com";
    String aQStr = "http://www.a.com?y=1";
    String aPath = "http://www.a.com/xyz/index.html";
    String aPath2 = "http://www.a.com/abc/page.html";
    String aPath3 = "http://www.news.a.com/abc/page.html";

    // 1) different domain then keep dest, temp or perm
    // a.com -> b.com*
    assertEquals(bDotCom, URLUtil.chooseRepr(aDotCom, bDotCom, true));
    assertEquals(bDotCom, URLUtil.chooseRepr(aDotCom, bDotCom, false));

    // 2) permanent and root, keep src
    // *a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html
    assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aQStr, false));
    assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aPath, false));

    // 3) permanent and not root and dest root, keep dest
    // a.com/xyz/index.html -> a.com*
    assertEquals(aDotCom, URLUtil.chooseRepr(aPath, aDotCom, false));

    // 4) permanent and neither root keep dest
    // a.com/xyz/index.html -> a.com/abc/page.html*
    assertEquals(aPath2, URLUtil.chooseRepr(aPath, aPath2, false));

    // 5) temp and root and dest not root keep src
    // *a.com -> a.com/xyz/index.html
    assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aPath, true));

    // 6) temp and not root and dest root keep dest
    // a.com/xyz/index.html -> a.com*
    assertEquals(aDotCom, URLUtil.chooseRepr(aPath, aDotCom, true));

    // 7) temp and neither root, keep shortest, if hosts equal by path else by hosts
    //  a.com/xyz/index.html -> a.com/abc/page.html*
    // *www.a.com/xyz/index.html -> www.news.a.com/abc/page.html
    assertEquals(aPath2, URLUtil.chooseRepr(aPath, aPath2, true));
    assertEquals(aPath, URLUtil.chooseRepr(aPath, aPath3, true));

    // 8) temp and both root keep shortest sub domain
    // *www.a.com -> www.news.a.com
    assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aSubDotCom, true));
  }

}
// java2s.com  | Contact Us | Privacy Policy
// Copyright 2009 - 12 Demo Source and Support. All rights reserved.
// All other trademarks are property of their respective owners.