// Java tutorial
/** * The MIT License (MIT) * * Copyright (C) 2014 Agile Knowledge Engineering and Semantic Web (AKSW) (usbeck@informatik.uni-leipzig.de) * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*/
package org.aksw.gerbil.datasets.datahub;

import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.aksw.gerbil.config.GerbilConfiguration;
import org.aksw.gerbil.datasets.datahub.model.Dataset;
import org.aksw.gerbil.datasets.datahub.model.Resource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.HttpHeaders;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.web.client.RestTemplate;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

/**
 * Discovers NIF corpora on DataHub.io. At construction time the loader
 * queries DataHub for the names of all datasets carrying every configured
 * tag, then inspects each matching dataset's downloadable resources and
 * keeps those Turtle ({@code .ttl}) files that are below the size limit
 * (and are not {@code dataid.ttl} metadata files). The result is available
 * via {@link #getDataSets()} as a (dataset name, resource URL) map.
 * <p>
 * Not thread-safe; intended to be constructed and read by a single thread.
 */
public class DatahubNIFLoader {

    private static final Logger LOGGER = LoggerFactory.getLogger(DatahubNIFLoader.class);

    /** Property key: URL prefix used to request a single dataset's meta information. */
    private static final String DATAHUB_NIF_CORPUS_META_INF_URL_PROPERTY_NAME = "org.aksw.gerbil.datasets.DatahubNIFLoader.metaInfURL";
    /** Property key: URL prefix used to request dataset names by tag. */
    private static final String DATAHUB_TAG_INF_URL_PROPERTY_NAME = "org.aksw.gerbil.datasets.DatahubNIFLoader.tagInfURL";
    /** Property key: array of tags a corpus must carry to be considered. */
    private static final String DATAHUB_NEEDED_TAGS_ARRAY_PROPERTY_NAME = "org.aksw.gerbil.datasets.DatahubNIFLoader.corpusTags";

    // FIXME - put the magic number in application.properties
    /** Maximum accepted resource size in bytes (~20 MB). */
    private static final long MAX_RESOURCE_CONTENT_LENGTH = 20_000_000L;

    private RestTemplate rt;
    private String neededTags[];
    /** Mapping from dataset name to the URL of its chosen Turtle resource. */
    private Map<String, String> datasets;

    public DatahubNIFLoader() {
        rt = new RestTemplate();
        init();
    }

    /**
     * Loads the configured tag list and triggers the two-step discovery:
     * first the tagged dataset names, then their downloadable resources.
     */
    private void init() {
        neededTags = GerbilConfiguration.getInstance().getStringArray(DATAHUB_NEEDED_TAGS_ARRAY_PROPERTY_NAME);
        if (neededTags == null) {
            LOGGER.error("Couldn't load the needed property \"{}\".", DATAHUB_NEEDED_TAGS_ARRAY_PROPERTY_NAME);
            // fall back to an empty tag list so the loops below are safe no-ops
            neededTags = new String[0];
        }
        List<String> nifDataSets = getNIFDataSets();
        getNIFDataSetsMetaInformation(nifDataSets);
    }

    /**
     * Fills {@link #datasets} with (dataset name, resource URL) pairs by
     * requesting each dataset's meta information and filtering its resources.
     *
     * @param nifDataSets names of the datasets to inspect
     */
    private void getNIFDataSetsMetaInformation(List<String> nifDataSets) {
        datasets = Maps.newHashMap();
        String nifCorpusMetaInfURL = GerbilConfiguration.getInstance()
                .getString(DATAHUB_NIF_CORPUS_META_INF_URL_PROPERTY_NAME);
        if (nifCorpusMetaInfURL == null) {
            LOGGER.error("Couldn't load the needed property \"{}\". Aborting.",
                    DATAHUB_NIF_CORPUS_META_INF_URL_PROPERTY_NAME);
            return;
        }
        // go through all datasets tagged with nif
        for (String d : nifDataSets) {
            ResponseEntity<Dataset.Response> entity = rt.getForEntity(nifCorpusMetaInfURL + d, Dataset.Response.class);
            if (entity.getStatusCode().equals(HttpStatus.OK)) {
                Dataset.Response body = entity.getBody();
                List<Resource> resources = body.getResult().getResources();
                // go through the downloadable Resources
                for (Resource r : resources) {
                    String url = r.getUrl();
                    LOGGER.debug("checking {}", url);
                    // HEAD request: we only need the size, not the content
                    HttpHeaders headers = rt.headForHeaders(url);
                    long contentLength = headers.getContentLength();
                    LOGGER.debug("{} bytes", contentLength);
                    // add if less than 20mb, ends with ttl (turtle) but not with dataid.ttl (we aint gonna need it yet)
                    if (contentLength < MAX_RESOURCE_CONTENT_LENGTH && url.endsWith(".ttl")
                            && !url.endsWith("dataid.ttl")) {
                        LOGGER.debug("{}: {} has less than 20mb and is turtle > add to Dataset", d, url);
                        datasets.put(d, url);
                    }
                }
            }
        }
    }

    /**
     * Queries DataHub for the names of all datasets that carry every tag in
     * {@code neededTags}, intersecting the per-tag result sets.
     *
     * @return the dataset names matching all tags; empty if the tag-info URL
     *         property is missing or no request succeeded
     */
    private List<String> getNIFDataSets() {
        List<String> result = Lists.newArrayList();
        String taggedCorpusURL = GerbilConfiguration.getInstance().getString(DATAHUB_TAG_INF_URL_PROPERTY_NAME);
        if (taggedCorpusURL == null) {
            LOGGER.error("Couldn't load the needed property \"{}\". Aborting.", DATAHUB_TAG_INF_URL_PROPERTY_NAME);
        } else {
            Set<String> requestResult, taggedCorpora = null;
            String[] body;
            for (int i = 0; i < neededTags.length; ++i) {
                try {
                    ResponseEntity<String[]> forEntity = rt.getForEntity(taggedCorpusURL + neededTags[i],
                            String[].class);
                    if (forEntity.getStatusCode().equals(HttpStatus.OK)) {
                        body = forEntity.getBody();
                        if (taggedCorpora == null) {
                            // first tag: start with its full result set
                            taggedCorpora = Sets.newHashSet(body);
                            LOGGER.debug("corpora with \"{}\" tag {}", neededTags[i], taggedCorpora);
                        } else {
                            requestResult = Sets.newHashSet(body);
                            LOGGER.debug("corpora with \"{}\" tag {}", neededTags[i], requestResult);
                            // Sets.intersection returns a view; safe here as both backing sets stay local
                            taggedCorpora = Sets.intersection(taggedCorpora, requestResult);
                        }
                    } else {
                        // BUGFIX: the "{}" placeholder for the status code was missing,
                        // so the status was silently dropped from the log message
                        LOGGER.warn("Couldn't get any datasets with the {} tag from DataHubIO. Status: {}",
                                neededTags[i], forEntity.getStatusCode());
                    }
                } catch (Exception e) {
                    // BUGFIX: pass the exception as the trailing throwable argument so
                    // SLF4J logs the full stack trace instead of only e.toString()
                    LOGGER.warn("Couldn't get any datasets with the {} tag from DataHubIO.", neededTags[i], e);
                }
            }
            // BUGFIX: the original discarded the computed intersection and always
            // returned an empty list, so no dataset metadata was ever loaded
            if (taggedCorpora != null) {
                result.addAll(taggedCorpora);
            }
        }
        return result;
    }

    public static void main(String[] args) {
        DatahubNIFLoader d = new DatahubNIFLoader();
        for (Entry<String, String> e : d.datasets.entrySet()) {
            LOGGER.debug("{}: {}", e.getKey(), e.getValue());
        }
    }

    /**
     * @return mapping from dataset name to the URL of its Turtle resource;
     *         empty if discovery failed. Never {@code null} after construction.
     */
    public Map<String, String> getDataSets() {
        return datasets;
    }
}