eu.riscoss.dataproviders.providers.FossologyDataProvider.java Source code

Java tutorial

Introduction

Here is the source code for eu.riscoss.dataproviders.providers.FossologyDataProvider.java

Source

package eu.riscoss.dataproviders.providers;

/**
 * @author Mirko Morandini, Fabio Mancinelli
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import javax.sound.midi.SysexMessage;

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import eu.riscoss.dataproviders.common.IndicatorsMap;

public class FossologyDataProvider implements AbstractDataProvider {

    private static final boolean VERBOSE = false; //output of all licenses and filterings

    /* Properties needed by the data collector */
    private static final String TARGET_FOSSOLOGY_PROPERTY = "targetFossology";
    private static final String FOSSOLOGY_SCANTYPE_PROPERTY = "fossologyScanType";
    private static final String LICENSE_FILE_PROPERTY = "licenseFile";
    private static final String TARGET_FOSSOLOGY_TXT_PROPERTY = "targetFossologyList";
    private static final String FOSSOLOGY_FILTER_EXTENSIONS = "fossologyFilterExtensions"; //true (default) / false
    private static final String FOSSOLOGY_ACCEPTED_EXTENSIONS = "fossologyAcceptedExtensions"; //comma-separated list of file extensions, e.g. java, cpp, jj, js, jsp, php, py

    /* String used to check when a file doens't have a license associated */
    private static final String NO_LICENSE_FOUND = "No_license_found";

    /* These are the license types defined in licenseType.properties */
    private static final String PERMISSIVE_LICENSE_TYPE = "permissive";
    private static final String COPYLEFT_LICENSE_TYPE = "copyleft";
    private static final String COPYLEFT_WITH_LINKING_LICENSE_TYPE = "copyleft-with-linking";
    private static final String COMMERCIAL_LICENSE_TYPE = "commercial";
    private static final String UNKNOWN_LICENSE_TYPE = "_unknown_";

    String[] acceptedExtensions = new String[0];

    /**
     * 
     * @param targetFosslolgy
     * @param licenseFile
     * @throws Exception 
     */

    public void createIndicators(IndicatorsMap im, Properties properties) throws Exception {
        //private static void createIndicatorsFromFossologyMeasures(String targetFosslolgy, String licenseFile) throws IOException {

        final String licenseFile = properties.getProperty(LICENSE_FILE_PROPERTY);
        if (licenseFile == null) {
            throw new Exception(String.format("%s property not speficied", LICENSE_FILE_PROPERTY));
        }

        String scanType = properties.getProperty(FOSSOLOGY_SCANTYPE_PROPERTY); //"overview" or "filelist"
        if (scanType == null)
            scanType = "overview"; //default value as it was the only one in prior versions

        this.acceptedExtensions = properties.getProperty(FOSSOLOGY_ACCEPTED_EXTENSIONS, "").split(",");

        if (properties.getProperty(FOSSOLOGY_FILTER_EXTENSIONS, "true") == "false") {
            this.acceptedExtensions = new String[0]; //empty the extensions list --> behaviour: accept all extensions 
        }
        System.out.println(this.acceptedExtensions);

        HashMap<String, Collection<String>> licensesMap = parseLicensesFile(licenseFile);

        HashMap<String, Integer> licenseBuckets;
        if (scanType.equals("filelist")) {
            String targetFossologyTxt = properties.getProperty(TARGET_FOSSOLOGY_TXT_PROPERTY);
            if (targetFossologyTxt == null) {
                throw new Exception(String.format("%s property not speficied", TARGET_FOSSOLOGY_TXT_PROPERTY));
            }
            licenseBuckets = analyseFileList(targetFossologyTxt, licensesMap);
        } else { //"overview"
            String targetFossology = properties.getProperty(TARGET_FOSSOLOGY_PROPERTY);
            if (targetFossology == null) {
                throw new Exception(String.format("%s property not speficied", TARGET_FOSSOLOGY_PROPERTY));
            }
            licenseBuckets = analyseOverviewReport(targetFossology, licensesMap);
        }

        //add all measures to the IndicatorsMap (= Risk Data)
        boolean addAll = false;
        if (addAll)
            for (String licenseBucket : licenseBuckets.keySet()) {
                im.add("Measure_Fossology." + licenseBucket, licenseBuckets.get(licenseBucket));
            }

        float total = licenseBuckets.get("_sum_"); //to make sure that the result of the division is a float //number of files
        Integer licenseCount = licenseBuckets.get("_count_"); //number of licenses found
        Integer numPermissive = licenseBuckets.get("Permissive License");
        Integer numCopyleft = licenseBuckets.get("FSF Copyleft");
        Integer numNoLicense = licenseBuckets.get("No License");
        Integer numUnknown = licenseBuckets.get("_unknown_");
        Integer numLinkingPermitted = licenseBuckets.get("FSF linking permitted");
        Integer numCommercial = licenseBuckets.get("Commercial license");
        Integer numPublicDomain = licenseBuckets.get("Public domain");
        Integer numMultiplyLicensed = licenseBuckets.get("_num_multiply_licensed_files_");

        im.add("number-of-different-licenses", licenseCount); //Number of (different?) component licenses
        im.add("percentage-of-files-without-license", numNoLicense / total); //% of files without license (Fossology)
        im.add("files-with-unknown-license", numUnknown / total); //% of files with unclear/unknown license (Fossology)
        im.add("copyleft-licenses", numCopyleft / total); //% of licenses: viral (Fossology)
        im.add("copyleft-licenses-with-linking", numLinkingPermitted / total); //% of licenses: library viral (Fossology)
        im.add("percentage-of-files-with-permissive-license", numPermissive / total); //% of licenses: without constraints (Fossology)
        im.add("files-with-commercial-license", numCommercial / total); //% of licenses: commercial (Fossology)
        im.add("percentage-of-files-with-public-domain-license", numPublicDomain / total);
        im.add("percentage-of-files-with-multiple-license", numMultiplyLicensed / total);
        //TODO
        im.add("files-with-ads-required-liceses", 0);

        //       i93b" label="Amount of OSS code integrated"
        //       i93c" label="Technique used for integrating code (static/dynamic linking, copy)"
        //       i93d" label="Type of licenses in core components"
        //       i93h" label="Amount of component code imported/linked from other OSS projects"
        //       i120" label="Percentage of US code"

        //System.out.println(IndicatorsMap.get().toString());
    }

    /**
     * Parses a LicensesCfg file
     * @param target
     * @return HashMap: License Types, each with a Collection of Licenses
     * @throws IOException
     */
    protected static HashMap<String, Collection<String>> parseLicensesFile(String target) throws IOException {
        HashMap<String, Collection<String>> result = new HashMap<String, Collection<String>>();
        Document document;
        if (target.startsWith("http")) {
            document = Jsoup.connect(target).get();
        } else {
            File file = new File(target);
            System.out.println("Fossology config file used: " + file.getCanonicalPath());
            document = Jsoup.parse(file, "UTF-8", "http://localhost");
        }

        //        System.out.println(document.outerHtml());

        Elements licensesLinks = document.getElementsByAttribute("id");

        for (Element element : licensesLinks) {
            String licenseName = element.child(0).text();
            if (element.children().size() > 1) {
                String s = element.child(1).text();
                Collection<String> licensesList = Arrays.asList(s.split("\\s*\\|\\s*")); //("\\s*\\|\\s*"));

                //xDebug            System.out.println("Analysed license type: "+licenseName+": "+licensesList);
                result.put(licenseName, licensesList);
            }
        }

        return result;
    }

    /**
     * Analyses a fossology html file
     * @param target
     * @param licensesMap
     * @return
     * @throws IOException
     */
    private HashMap<String, Integer> analyseOverviewReport(String target,
            HashMap<String, Collection<String>> licensesMap) throws IOException {
        //private static HashMap<String, Integer> analyseFossologyReport(String target, String licenseFile) throws IOException {
        //        List<String> result = new ArrayList<String>();
        Document document;

        if (target.startsWith("http")) {
            document = Jsoup.connect(target).get();
        } else {
            File file = new File(target);
            document = Jsoup.parse(file, "UTF-8", "http://localhost");
        }

        Element table = document.select("table[id=lichistogram]").first();
        Elements rows = table.select("tr");

        List<LicenseEntry> llist = new ArrayList<LicenseEntry>(); //list of licenses in the fossology file

        //for each license, parses the name (0) and the number of occurrences (2) and saves it as a LicenseEntry
        for (Element element : rows) {
            Elements col = element.select("td");

            if (col.size() != 0) {
                int c = Integer.parseInt(col.get(0).ownText());//num of occurrences
                String lic = col.get(2).text();
                llist.add(new LicenseEntry(c, lic));
                //mlist.put(lic, c);
            }
            //           System.out.println(col.get(1).ownText());
            //           Element count=col.get(0);
        }

        //get license type buckets

        HashMap<String, Integer> licenseBuckets = new HashMap<String, Integer>();
        int total = 0;

        Set<String> licenseTypes = licensesMap.keySet();
        //initialize with 0 to avoid missing types
        for (String licensetype : licenseTypes) {
            licenseBuckets.put(licensetype, 0);
        }

        boolean matched = false;
        int numUnknown = 0;
        for (LicenseEntry le : llist) {
            for (String licenseType : licenseTypes) {//cycles on license types from config file
                if (le.matchesOneOf(licensesMap.get(licenseType), licenseType)) {
                    Integer currentcount = licenseBuckets.get(le.licensetype);
                    if (currentcount == null) //for safety, but should be initialised
                        currentcount = 0;
                    licenseBuckets.put(le.licensetype, currentcount + le.count);
                    matched = true;
                }
            }
            total += le.count;
            if (matched == false) { //unknown
                numUnknown += le.count;
                System.out.println("Unknown license: " + le.getName());
            }
        }

        licenseBuckets.put("_unknown_", numUnknown);
        licenseBuckets.put("_sum_", total);
        licenseBuckets.put("_count_", llist.size());

        System.out.println("\nLicense Buckets Fossology from HTML overview scanning:");
        System.out.println(licenseBuckets);

        //        for (String license : result) {
        //            System.out.format("%s\n", license);
        //        }
        return licenseBuckets;
    }

    public boolean matchesOneOf(Collection<String> si, String license) {
        for (String string : si) {
            //         if (name.contains(string))
            if (license.startsWith(string)) {
                return true;
            }
        }
        return false;
    }

    private String getLicenseTypeForLicense(HashMap<String, Collection<String>> licensesMap, String license) {
        for (String l : licensesMap.keySet()) {
            //if (license.toLowerCase().contains(l.toLowerCase())) {
            //attention: order matters in the file! (e.g. to parse GPL/LGPL correctly)
            if (matchesOneOf(licensesMap.get(l), license))
                return l;
        }
        //DEBUG - add missing licenses to licenses file!
        System.err.println("WARNING: getLicenseTypeForLicense() #Unknown license: " + license
                + ". Try to add it to LicensesCfg.");
        return UNKNOWN_LICENSE_TYPE;
    }

    /**
    * Parses a Fossology-generated License txt file with list of files and licenses. Example row: 
    * SAT4J 2.3.3/SAT4J 2.3/Sat4J-2.3.3/plugin.properties: EPL-1.0 ,LGPL-2.1+
    * @param targetFossology path+filename (http or local)
    * @param licensesMap 
    * @return
     * @throws IOException 
     * @throws ClientProtocolException 
    */
    private HashMap<String, Integer> analyseFileList(String targetFossology,
            HashMap<String, Collection<String>> licensesMap) throws ClientProtocolException, IOException {

        //LicenseAnalysisReport licenseAnalysisReport; TODO use this one
        BufferedReader br = null;
        HttpEntity entity = null;
        CloseableHttpResponse response = null;

        int totalFiles = 0;
        int numMultiplyLicensedFiles = 0;
        int numAdditionalLicenseDefinitions = 0;
        String line;
        int i = 0;
        Map<String, Integer> licenseOccurrences = new HashMap<String, Integer>();
        boolean onlyXLinesDisplayed = false;

        try {
            //open text file with list of files and licenses
            if (targetFossology.toLowerCase().startsWith("http")) {
                CloseableHttpClient httpClient = HttpClients.createDefault();
                HttpGet get = new HttpGet(targetFossology);
                response = httpClient.execute(get);

                entity = response.getEntity();
                if (entity != null) {
                    InputStream is = entity.getContent();
                    br = new BufferedReader(new InputStreamReader(entity.getContent()));

                    //EntityUtils.consume(entity); //release all resources held by the httpEntity
                }
                //response.close();
            } else { //local file

                br = new BufferedReader(new InputStreamReader(new FileInputStream(targetFossology)));
            }

            /* Calculate the occurrences for each license type */

            while ((line = br.readLine()) != null) {
                //DEBUG 
                //System.out.println(i++ +" "+line);
                /* Parse only the lines that contains a ':' */
                if (line.contains("Warning: Only the last")) {
                    System.out.println();
                    System.err.println("WARNING: " + line);
                    System.out.println();
                    onlyXLinesDisplayed = true;
                    break;
                }

                if (line.contains(":")) {
                    String[] parts = line.split(":", 2);
                    if (parts.length > 1) {

                        if (acceptedExtension(parts[0].trim())) {
                            String licenseString = parts[1].trim();
                            String[] licenses = licenseString.split(","); //multiple licenses possible

                            for (String license : licenses) {
                                if (licenseOccurrences.get(license) == null) {
                                    licenseOccurrences.put(license, 1);
                                } else {
                                    licenseOccurrences.put(license, licenseOccurrences.get(license) + 1);
                                }
                            }
                            if (VERBOSE) {
                                for (String l : licenses) {
                                    System.out.print(l + "  ");
                                }
                                System.out.println();
                            }
                            totalFiles++;
                            numAdditionalLicenseDefinitions += licenses.length - 1;//0 if single license
                            numMultiplyLicensedFiles += licenses.length <= 1 ? 0 : 1; //0 if single license
                        }
                    }
                }
            }
        } finally {
            if (entity != null) { //http
                EntityUtils.consume(entity); //release all resources held by the httpEntity
                response.close();
            }
            br.close(); //also if local
        }
        HashMap<String, Integer> licenseBuckets = new HashMap<String, Integer>();
        //TODO: switch from licenseBuckets to the use of licenseAnalysisReport
        //licenseAnalysisReport.totalFiles = totalFiles;

        licenseBuckets.put("_sum_", totalFiles); //num of files

        licenseBuckets.put("_num_multiply_licensed_files_", numMultiplyLicensedFiles);
        licenseBuckets.put("_num_additional_licenses_", numAdditionalLicenseDefinitions);

        //licenseAnalysisReport.numberOfLicenses = licenseOccurrences.keySet().size();
        licenseBuckets.put("_count_", licenseOccurrences.keySet().size());
        if (licenseOccurrences.get(NO_LICENSE_FOUND) != null) {
            /* Removes the NO_LICENSE_FOUND pseudolicense from the number of licenses found. */
            /* UnclassifiedLicense pseudolicense remains still included */
            //licenseAnalysisReport.numberOfLicenses--;
            licenseBuckets.put("_count_", licenseOccurrences.keySet().size() - 1);
        }

        //initializes with 0 to avoid missing types
        licenseBuckets.put("_unknown_", 0);
        for (String licensetype : licensesMap.keySet()) {
            licenseBuckets.put(licensetype, 0);
        }

        /* Find license types and sum their occurrences*/
        for (String license : licenseOccurrences.keySet()) {
            //if (!license.equals(NO_LICENSE_FOUND)) { //MM: commented... also this pseudolicense is used for bucketing 
            String licenseType = getLicenseTypeForLicense(licensesMap, license); //UNKNOWN_LICENSE_TYPE _unknown_ if none matches

            if (licenseBuckets.get(licenseType) == null) {
                licenseBuckets.put(licenseType, licenseOccurrences.get(license));
            } else {
                licenseBuckets.put(licenseType, licenseBuckets.get(licenseType) + licenseOccurrences.get(license));
            }
            //}
        }

        System.out.println("\nLicense Buckets Fossology from TXT filelist scanning:");
        System.out.println(licenseBuckets);

        //        for (String license : result) {
        //            System.out.format("%s\n", license);
        //        }
        return licenseBuckets;

    }

    private boolean acceptedExtension(String filePathString) {

        //acceptedExtensions = {"java", "cpp", "jj", "js", "jsp", "php", "py",....}; //use this.acceptedExtensions
        if (acceptedExtensions.equals(""))
            return true; //default: all extensions
        String[] filePath = filePathString.trim().split("/");
        String fileNameString = filePath[filePath.length - 1];

        int dot = fileNameString.lastIndexOf('.');
        String extension = (dot == -1) ? "" : fileNameString.substring(dot + 1).toLowerCase(); //empty string if '.' is the last char

        if (VERBOSE) {
            System.out.print("." + extension);
        }
        for (String accext : this.acceptedExtensions) {
            if (accext.trim().equalsIgnoreCase(extension)) {
                if (VERBOSE) {
                    System.out.println();
                }
                return true;
            }
        }
        if (VERBOSE)
            System.out.println(" - filtered.");

        final String[] knownNonCode = { "txt", "xml", "xslt", "xsd", "xsl", "xul", "xed", "xmi", "wsdl", "owl",
                "html", "xhtml", "htm", "properties", "prefs", "test", "pom", "project", "dtd", "css", "scss",
                "ttf", "diff", "license", "ico", "png", "gif", "jpg", "pspimage", "psd", "doc", "sh", "bat", "ods",
                "odp", "rdf", "manifest", "cat", "zip", "vm", "mf", "old", "bak", "ini", "cfg", "conf", "config",
                "def", "inf", "lst", "sql", "json", "wsdl", "class", "classpath", "type", "less", "md5", "sha1",
                "" }; //vm: velocity
        if (!Arrays.asList(knownNonCode).contains(extension))
            System.err.println("INFO: unknown extension: " + extension);

        return false;
    }

}

// from the legal risk model xml:
//      <indicator id="i103a" label="Number of component licenses (Fossology)" datatype="integer"/>
//      <indicator id="i103b" label="Amount of files without license (Fossology)" datatype="real"/>
//      <indicator id="i103c" label="Amount of files with unclear/unknown license (Fossology)" datatype="real"/>
//      <indicator id="i91" label="Amount of licenses: viral (Fossology)" datatype="real"/>
//      <indicator id="i93f" label="Amount of licenses: library viral (Fossology)" datatype="real"/>
//      <indicator id="i93g" label="Amount of licenses: without constraints (Fossology)" datatype="real"/>
//      <indicator id="i93b" label="Amount of OSS code integrated" datatype="real"/>
//      <indicator id="i93c" label="Technique used for integrating code (static/dynamic linking, copy)" datatype="integer"/>
//      <indicator id="i93d" label="Type of licenses in core components" datatype="String"/>
//      <indicator id="i93h" label="Amount of component code imported/linked from other OSS projects" datatype="real"/>
//      <indicator id="i120" label="Percentage of US code" datatype="integer"/>

//<situation id="s1" label="License virality" threshold="0.5"/> <!-- i91, i93f, i93g, i93c-->
//<situation id="s2" label="License compatibility" threshold="0.5"/> <!-- i93d, i103c -->
//<situation id="s3" label="License uncertainty" threshold="0.5"/> <!-- i103b, i103c, -i93g, -->
//<situation id="s4" label="Code problematicity" threshold="0.5"/> <!-- i103a, i93b, i93h -->
//<situation id="s5" label="Availability and verifiability of information on ownership and quality assurance" threshold="0.5"/> <!-- i93d, i93h -->
//<situation id="s6" label="Percentage of US code" threshold="10"/> <!-- i120 -->