// Java tutorial
/* Copyright Software Engineering Research laboratory <serl@cs.wichita.edu> This program is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. GNU library General Public */ package relevantfile; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.StringReader; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.en.PorterStemFilter; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.util.Version; import org.w3c.dom.NodeList; import org.w3c.dom.Document; import org.w3c.dom.Node; import org.xml.sax.SAXException; import java.util.ArrayList; import java.util.HashMap; import java.util.Map.Entry; import java.util.regex.Matcher; import java.util.regex.Pattern; /* @author: Sara Bahrami <mxbahramizanjani@wichita.edu> * Purpose: Parsing the XML files(output of SrcML.jar) * output: Corpus with the textual information for each source code file in separated rows. 
* */ public class XmlParser { public static HashMap<String, ArrayList<String>> Corpus = new HashMap<String, ArrayList<String>>(); //public static String dirPath = "/media/extrav/DeveloperRecommendation/Eclipse/CorpusEclipse2013-04"; // public static String Corpusfilepath="/media/extrav/Eclipse/Repository/CorpusEclipse2013-04"; public static String Corpusfilepath; // public static String xmldirPath ="/media/extrav/Eclipse/Repository/srcml/"; public static String xmldirPath; public static void main(String[] args) { xmldirPath = args[0]; Corpusfilepath = args[1]; try { File Corpusfile = new File(Corpusfilepath); BufferedWriter writer = new BufferedWriter(new FileWriter(Corpusfile)); File folder = new File(xmldirPath); File[] listOfFiles = folder.listFiles(); for (File file : listOfFiles) { if (file.length() != 0) { String filename = file.getName().replace('-', File.separatorChar); System.out.println(filename); print(writer, CreatCorpus(file, xmldirPath), filename); } } writer.flush(); writer.close(); } catch (Exception e) { e.printStackTrace(); } } /********************************************************************/ public static String CamelCase(String Str) { Str = Str.replaceAll("[\\W|\\_]", " "); Str = Str.replaceAll("((?<=[a-z])(?=[A-Z]))|((?<=[A-Z])(?=[A-Z][a-z]))", " "); return Str; } /*********************************************************************************************************************/ public static ArrayList<String> CreatCorpus(File file, String xmldirPath) throws ParserConfigurationException, SAXException, IOException { ArrayList<String> textinfo = new ArrayList<String>(); String name = file.getName(); File fXmlFile = new File(xmldirPath + File.separatorChar + name); DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); Document doc = dBuilder.parse(fXmlFile); doc.getDocumentElement().normalize(); NodeList nList_name = doc.getElementsByTagName("name"); for (int 
temp = 0; temp < nList_name.getLength(); temp++) { Node nNode = nList_name.item(temp); String value; value = nNode.getFirstChild().getTextContent(); String[] value1 = CamelCase(value).split("\\s+"); for (int j = 0; j < value1.length; j++) { String infotemp = removeStopWordsAndStem(value1[j]); if (infotemp.length() > 0) { textinfo.add(infotemp); } } } NodeList nList_comment = doc.getElementsByTagName("comment"); for (int temp = 0; temp < nList_comment.getLength(); temp++) { Node nNode = nList_comment.item(temp); String value1; value1 = nNode.getFirstChild().getTextContent().trim(); if (!(value1.contains("Copyright"))) { value1 = value1.replaceAll("[\\r|\\n]", " "); value1 = value1.replaceAll("[\\*|\\/]", ""); value1 = value1.replaceAll("@author", ""); value1 = value1.trim(); value1 = value1.replaceAll(" +", " "); //System.out.println(value1); String[] str = value1.split("\\s+"); for (int j = 0; j < str.length; j++) { String infotemp = removeStopWordsAndStem(str[j]); if (infotemp.length() > 0) { textinfo.add(infotemp); } } } /*if (value1.contains("Copyright")) { value1 = value1.replaceAll("[\\r|\\n]", " "); value1 = value1.replaceAll("[\\*|\\/]", ""); value1 = value1.replaceAll("@author", ""); value1=value1.trim(); value1=value1.replaceAll(" +", " "); //System.out.println(value1); //if((value1.contains("Description"))&&(value1.contains("Contributors"))) //{ //Integer first=value1.indexOf("Description"); //Integer end=value1.indexOf("Contributors"); //value1=value1.substring(first, end); String[] str=value1.split("\\s+"); //System.out.println(value1); for(int j=0;j<str.length;j++) { //if(!textinfo.contains(str[j])) //{ textinfo.add(str[j]); //} } //} }*/ } NodeList nlListexp = doc.getElementsByTagName("expr"); for (int k = 0; k < nlListexp.getLength(); k++) { Node node = nlListexp.item(k); if (node.hasChildNodes() == true) if (node.getFirstChild().getNodeType() == Node.TEXT_NODE) { String exprvalue = node.getFirstChild().getTextContent().trim(); exprvalue = 
exprvalue.replaceAll("[\\r|\\n]", " "); String[] str = exprvalue.split("\\s+"); for (int j = 0; j < str.length; j++) { String infotemp = removeStopWordsAndStem(str[j]); if (infotemp.length() > 0) { textinfo.add(infotemp); } } } } return textinfo; } /******************************************************************************************/ public static void print(BufferedWriter writer, ArrayList<String> textinfo, String filename) throws Exception { writer.write(filename); writer.write("\t"); for (int i = 0; i < textinfo.size(); i++) { writer.write(removeStopWordsAndStem(textinfo.get(i)).trim()); writer.write(" "); } writer.newLine(); } /********************************************************************************************/ public static String removeStopWordsAndStem(String input) throws IOException { /*String[] stop_word={"abstract","assert","boolean","break","byte","case","catch","char","class","const","continue" ,"default","do","double","else","enum","extends","final","finally","float","for","goto","if","implements","import","instanceof","int" ,"interface","long","native","new","package","private","protected","public","return","short","static","strictfp","super", "switch","synchronized","this","throw","throws","transient","try","void","volatile","while","false","null","true"};*/ String[] stop_word = { "auto", "break", "case", "char", "const", "continue", "default", "do", "double", "else", "enum", "extern", "float", "for", "goto", "if", "int", "long", "register", "return", "short", "signed", "sizeof", "static", "struct", "switch", "typedef", "union", "unsigned", "void", "volatile", "while", "abstract", "as", "base", "bool", "byte", "catch", "checked", "class", "decimal", "delegate", "event", "explicit", "false", "finally", "fixed", "foreach", "implicit", "in", "interface", "internal", "is", "lock", "namespace", "new", "null", "object", "operator", "out", "override", "params", "private", "protected", "public", "readonly", "ref", "sbyte", "sealed", 
"stackalloc", "string", "this", "throw", "true", "try", "typeof", "uint", "ulong", "unchecked", "unsafe", "ushort", "using", "virtual" }; ArrayList<String> stopWords = new ArrayList<String>(); for (int k = 0; k < stop_word.length; k++) stopWords.add(stop_word[k]); TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_46, new StringReader(input)); tokenStream = new StopFilter(Version.LUCENE_46, tokenStream, StandardAnalyzer.STOP_WORDS_SET); tokenStream = new StopFilter(Version.LUCENE_46, tokenStream, StopFilter.makeStopSet(Version.LUCENE_46, stopWords)); tokenStream = new PorterStemFilter(tokenStream); StringBuilder sb = new StringBuilder(); CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { if (sb.length() > 0) { sb.append(" "); } sb.append(token.toString()); } tokenStream.end(); tokenStream.close(); return sb.toString(); } }