// ProcessRSSFile.java (page 1 of 3)
import java.io.*;
import java.util.*;
/**
 * Command-line tool that reads an RSS/RDF file and writes a set of static HTML pages from it.
 *
 * <p>The work is done in three steps, each exposed as a public static method operating on the
 * public static fields of this class:
 * <ol>
 *   <li>{@link #processRSSFile()} scans {@link #rssFile} and fills {@link #linkList},
 *       {@link #idList} and {@link #descriptionList};</li>
 *   <li>{@link #saveHTMLIndexFile()} writes {@code UsefulChemistryMolecules.html}, an index page
 *       with a drop-down of every id and an iframe showing the selected page;</li>
 *   <li>{@link #saveHTMLFiles()} writes one {@code <id>.html} page per entry of
 *       {@link #idList}.</li>
 * </ol>
 *
 * <p>The three lists are parallel: entry {@code i} of each list is assumed to describe the same
 * item. Nothing in the parser enforces this, so an item lacking a molecule id or a CDATA
 * description shifts the later entries.
 *
 * <p>All state is static and mutable; the class is not safe for concurrent use.
 */
public class ExtractHTMLPages
{
    /** Scratch variables updated while parsing/writing; {@code outputFile} is not used by this class. */
    public static String inputLine, outputFile, id, link;
    public static String separator = System.getProperty("file.separator");
    public static String newLine = System.getProperty("line.separator");
    /** Directory that receives every generated HTML file. */
    public static String saveDirectory;
    /** Path of the RSS/RDF file read by {@link #processRSSFile()}. */
    public static String rssFile;
    /** Buffer holding the description currently being read. */
    public static StringBuffer description;
    // The lists are created eagerly so the public methods can be called without going through
    // main(); main() still replaces them with fresh lists on every run.
    /** Molecule ids (Strings), one per {@code cml:molecule id="..."} attribute found. */
    public static ArrayList idList = new ArrayList();
    /** Item links (Strings), one per {@code <item rdf:about="..."} attribute found. */
    public static ArrayList linkList = new ArrayList();
    /** Description bodies (Strings), one per CDATA description found. */
    public static ArrayList descriptionList = new ArrayList();
    /** Scratch offsets of the attribute value most recently extracted from a line. */
    public static int start, stop;

    private static final String ITEM_MARKER = "<item rdf:about=\"";
    private static final String MOLECULE_MARKER = "cml:molecule id=\"";

    // NOTE(review): these rows are written as "<a href=\"" immediately followed by the label in
    // the code this replaces, i.e. the link targets are missing and the attribute is never
    // closed, which corrupts the rest of the page. The labels are therefore emitted as plain
    // text; restore the anchors once the intended URLs are known.
    private static final String[] RESOURCE_LABELS =
    {
        "Spreadsheet of Useful Chemistry Molecules",
        "File of Useful Chemistry Molecules",
        "Chemistry Molecules SD File",
        "Chemistry Molecules New Items Page",
        "Chemistry Molecules Blog"
    };

    /**
     * Parses {@link #rssFile} line by line and appends what it finds to {@link #linkList},
     * {@link #idList} and {@link #descriptionList}.
     *
     * <p>Everything up to and including the first line containing {@code </channel>} is skipped.
     * After that, each line is checked for an {@code <item rdf:about="...">} link, a
     * {@code cml:molecule id="..."} id and a {@code <description>} tag. A description is only
     * recorded when the line after {@code <description>} contains {@code <![CDATA[}; its body is
     * the trimmed lines strictly between that line and the first line containing {@code ]]>}
     * (text sharing a line with either delimiter is not captured).
     *
     * @throws IOException if the file cannot be read, an attribute value has no closing quote,
     *         or a CDATA section is not terminated before the end of the file
     * @throws Exception declared for compatibility with existing callers
     */
    public static void processRSSFile() throws Exception
    {
        BufferedReader br = new BufferedReader(new FileReader(rssFile));
        try
        {
            // Skip the channel header; items follow the closing </channel> tag.
            while ((inputLine = br.readLine()) != null)
            {
                if (inputLine.indexOf("</channel>") >= 0) break;
            }
            while ((inputLine = br.readLine()) != null)
            {
                String value = quotedValueAfter(inputLine, ITEM_MARKER);
                if (value != null) linkList.add(value);

                value = quotedValueAfter(inputLine, MOLECULE_MARKER);
                if (value != null) idList.add(value);

                if (inputLine.indexOf("<description>") >= 0)
                {
                    readDescription(br);
                }
            }
        }
        finally
        {
            // Release the file handle even when parsing fails half-way through.
            br.close();
        }
    }

    /**
     * Returns the text between {@code marker} (which must end with the opening quote) and the
     * next double quote in {@code line}, or {@code null} when the marker is absent.
     * The value is located relative to the marker itself, so an unrelated attribute earlier on
     * the same line cannot be picked up by mistake.
     */
    private static String quotedValueAfter(String line, String marker) throws IOException
    {
        int at = line.indexOf(marker);
        if (at < 0) return null;
        start = at + marker.length();
        stop = line.indexOf("\"", start);
        if (stop < 0)
        {
            throw new IOException("Unterminated attribute value after '" + marker + "' in line: " + line);
        }
        return line.substring(start, stop);
    }

    /**
     * Reads the CDATA body that follows a {@code <description>} line and appends it to
     * {@link #descriptionList}. Does nothing when the next line does not open a CDATA section
     * (that line is consumed either way) or when the file ends right after the tag.
     */
    private static void readDescription(BufferedReader br) throws IOException
    {
        inputLine = br.readLine();
        if (inputLine == null || inputLine.indexOf("<![CDATA[") < 0) return;

        description = new StringBuffer();
        inputLine = br.readLine();
        while (inputLine != null && inputLine.indexOf("]]>") < 0)
        {
            description.append(inputLine.trim());
            description.append(newLine);
            inputLine = br.readLine();
        }
        if (inputLine == null)
        {
            // readLine() returns null at end of file; report it instead of dereferencing it.
            throw new IOException("Unterminated CDATA section (missing \"]]>\") in " + rssFile);
        }
        descriptionList.add(description.toString());
    }

    /** Writes {@code text} followed by the platform line separator held in {@link #newLine}. */
    private static void writeLine(Writer out, String text) throws IOException
    {
        out.write(text);
        out.write(newLine);
    }

    /**
     * Writes {@code UsefulChemistryMolecules.html} into {@link #saveDirectory}.
     *
     * <p>The page contains a {@code <select>} with one option per entry of {@link #idList}; a
     * small script loads {@code <id>.html} into an iframe and shows a link to it whenever the
     * selection changes (and once on page load).
     *
     * @throws IOException if the file cannot be created or written
     * @throws Exception declared for compatibility with existing callers
     */
    public static void saveHTMLIndexFile() throws Exception
    {
        FileWriter fw = new FileWriter(saveDirectory + separator + "UsefulChemistryMolecules.html");
        try
        {
            writeLine(fw, "<html>");
            writeLine(fw, "<head>");
            writeLine(fw, "<title>Useful Chemistry Molecules</title>");
            writeLine(fw, newLine + "<script type=\"text/javascript\">");
            writeLine(fw, "var UCNumber;");
            writeLine(fw, "var UCNumberHTMLFile;");
            writeLine(fw, "function getHTMLFile()");
            writeLine(fw, "{");
            writeLine(fw, " UCNumber = document.forms[0].elements[0].value;");
            writeLine(fw, " UCNumberHTMLFile = UCNumber + \".html\";");
            writeLine(fw, " document.getElementById(\"link\").innerHTML = \"<a href=\\\"\" + UCNumberHTMLFile + \"\\\">View \" + UCNumberHTMLFile + \" in full window.</a>\";");
            writeLine(fw, " document.getElementById(\"description\").src = UCNumberHTMLFile;");
            writeLine(fw, "}" + newLine);
            writeLine(fw, "</script>");
            writeLine(fw, "</head>");
            writeLine(fw, "<body onLoad=\"getHTMLFile()\">");
            writeLine(fw, "<h2>Useful Chemistry Molecules</h2>");
            writeLine(fw, "<hr/>");
            writeLine(fw, "<table cellspacing=\"2\" cellpadding=\"1\">");
            for (int i = 0; i < RESOURCE_LABELS.length; i++)
            {
                writeLine(fw, "<tr><td nowrap>" + RESOURCE_LABELS[i] + "</td></tr>");
            }
            writeLine(fw, "</table>");
            writeLine(fw, "<hr/>");
            writeLine(fw, "<form name=\"UCNumberSelectionForm\">");
            writeLine(fw, "<select name=\"UCNumber\" onChange=\"getHTMLFile()\">");
            for (int i = 0; i < idList.size(); i++)
            {
                id = (String) idList.get(i);
                writeLine(fw, "<option value=\"" + id + "\">" + id + "</option>");
            }
            writeLine(fw, "</select> Select the UC # to view the item.");
            writeLine(fw, "</form>");
            // The script above looks these two elements up by id, so they must be well-formed.
            writeLine(fw, "<div id=\"link\"></div>");
            writeLine(fw, "<hr/>");
            writeLine(fw, "<iframe id=\"description\" frameborder=\"0\" height=\"100%\" width=\"100%\"></iframe>");
            writeLine(fw, "</body>");
            writeLine(fw, "</html>");
            fw.flush();
        }
        finally
        {
            fw.close();
        }
    }

    /**
     * Writes one {@code <id>.html} page into {@link #saveDirectory} for every entry of
     * {@link #idList}.
     *
     * <p>Page {@code i} links to {@code linkList[i]} and to the sibling files {@code <id>.xls}
     * and {@code <id>.txt} (which this class does not create), followed by
     * {@code descriptionList[i]} written verbatim.
     *
     * @throws IllegalStateException if {@link #linkList} or {@link #descriptionList} has no entry
     *         for an id; pages for earlier ids have already been written at that point
     * @throws IOException if a page cannot be created or written
     * @throws Exception declared for compatibility with existing callers
     */
    public static void saveHTMLFiles() throws Exception
    {
        for (int i = 0; i < idList.size(); i++)
        {
            id = (String) idList.get(i);
            // Check before opening the file so a short list neither leaks a writer nor leaves
            // an empty page behind.
            if (i >= linkList.size() || i >= descriptionList.size())
            {
                throw new IllegalStateException("No link/description parsed for id " + id
                    + " (ids: " + idList.size() + ", links: " + linkList.size()
                    + ", descriptions: " + descriptionList.size() + ")");
            }
            link = (String) linkList.get(i);
            String xlsFile = id + ".xls";
            String textFile = id + ".txt";

            FileWriter fw = new FileWriter(saveDirectory + separator + id + ".html");
            try
            {
                writeLine(fw, "<html>");
                writeLine(fw, "<head>");
                writeLine(fw, "<title>" + id + " From Useful Chemistry Molecules</title>");
                writeLine(fw, "</head>" + newLine);
                writeLine(fw, "<body>");
                writeLine(fw, "<table cellspacing=\"2\" cellpadding=\"1\">");
                writeLine(fw, "<tr><td nowrap><a href=\"" + link + "\">View " + id + " UsefulChem Blog Entry</a></td></tr>");
                writeLine(fw, "<tr><td nowrap><a href=\"" + xlsFile + "\">Excel Version of this information</a></td></tr>");
                writeLine(fw, "<tr><td nowrap><a href=\"" + textFile + "\">Text Version of this information</a></td></tr>");
                writeLine(fw, "</table>");
                writeLine(fw, "<br/>Click on the image to view this molecule in Jmol:<br/>");
                writeLine(fw, (String) descriptionList.get(i));
                writeLine(fw, "</body>");
                writeLine(fw, "</html>");
                fw.flush();
            }
            finally
            {
                fw.close();
            }
        }
    }

    /**
     * Entry point: {@code java ExtractHTMLPages <save directory> <RSS File>}.
     *
     * <p>Creates the save directory if needed, then parses the RSS file and writes the index and
     * per-id pages. Progress is reported on standard output as {@code [ok]} or
     * {@code [failed] <reason>}.
     *
     * @param args {@code args[0]} is the save directory, {@code args[1]} the RSS file
     */
    public static void main(String[] args)
    {
        if (args.length < 2)
        {
            System.out.println("Usage: java ExtractHTMLPages <save directory> <RSS File>");
            System.exit(0);
        }
        try
        {
            saveDirectory = args[0];
            rssFile = args[1];
            System.out.print("Extracting HTML pages from " + rssFile + " ... ");
            idList = new ArrayList();
            linkList = new ArrayList();
            descriptionList = new ArrayList();
            File dd = new File(saveDirectory);
            // mkdirs() also creates missing parent directories; report failure right away
            // instead of letting the first file write fail later.
            if (!dd.isDirectory() && !dd.mkdirs())
            {
                throw new IOException("Cannot create save directory " + saveDirectory);
            }
            processRSSFile();
            saveHTMLIndexFile();
            saveHTMLFiles();
            System.out.println("[ok]");
        }
        catch (Exception e)
        {
            // Some exceptions carry no message; fall back to the exception itself.
            String reason = e.getMessage();
            System.out.println("[failed] " + (reason != null ? reason : e.toString()));
        }
    }
}