// ProcessRSSFile.java (page 1 of 3)

import java.io.*;

import java.util.*;

/**
 * Command-line tool that reads an RSS file whose items are written as
 * {@code <item rdf:about="...">} elements and writes, into a save directory,
 * an index page ({@code UsefulChemistryMolecules.html}) plus one HTML page per
 * molecule id found in the feed.
 *
 * <p>All state is held in public static fields; {@link #main(String[])} shows the
 * expected call order: set {@link #saveDirectory} and {@link #rssFile}, create the
 * three lists, then call {@link #processRSSFile()}, {@link #saveHTMLIndexFile()} and
 * {@link #saveHTMLFiles()}.
 */
public class ExtractHTMLPages
{
    /**
     * Legacy scratch fields. The methods of this class use local variables instead;
     * these remain declared only so that external code referencing them still compiles.
     */
    public static String inputLine, outputFile, id, link;

    /** Platform file separator, used to build output paths. */
    public static String separator = System.getProperty("file.separator");

    /** Platform line separator, appended after every line written to the generated files. */
    public static String newLine = System.getProperty("line.separator");

    /** Directory that receives the generated HTML files. */
    public static String saveDirectory;

    /** Path of the RSS file to read. */
    public static String rssFile;

    /** Legacy scratch field; kept for source compatibility (see {@link #inputLine}). */
    public static StringBuffer description;

    /** Molecule ids (as {@code String}), in order of appearance in the feed. */
    public static ArrayList idList;

    /** Item links (as {@code String}), in order of appearance in the feed. */
    public static ArrayList linkList;

    /** Item descriptions (as {@code String}), in order of appearance in the feed. */
    public static ArrayList descriptionList;

    /** Legacy scratch fields; kept for source compatibility (see {@link #inputLine}). */
    public static int start, stop;

    /** Text that precedes an item's link value on its line. */
    private static final String ITEM_LINK_MARKER = "<item rdf:about=\"";

    /** Text that precedes a molecule's id value on its line. */
    private static final String MOLECULE_ID_MARKER = "cml:molecule id=\"";

    /**
     * Placeholder target for the static rows of the index page.
     * NOTE(review): in the source as received, each of those rows read
     * {@code <a href=" Label</a} with no URL and no closing quote, so the real link
     * targets could not be recovered. Restore the actual URLs (and the full labels,
     * which also look truncated) from the original project.
     */
    private static final String UNKNOWN_LINK_TARGET = "#";

    /**
     * Parses {@link #rssFile} and fills {@link #linkList}, {@link #idList} and
     * {@link #descriptionList}, which must already be non-null.
     *
     * <p>Parsing is line-oriented. Everything up to and including the first line that
     * contains {@code </channel>} is skipped. After that:
     * <ul>
     *   <li>a line containing {@code <item rdf:about="} contributes the quoted value to
     *       {@code linkList};</li>
     *   <li>a line containing {@code cml:molecule id="} contributes the quoted value to
     *       {@code idList};</li>
     *   <li>a line containing {@code <description>} consumes the following line; if that
     *       line contains {@code <![CDATA[}, every subsequent line up to (not including)
     *       the first line containing {@code ]]>} is trimmed and appended, each followed
     *       by {@link #newLine}, to form one entry of {@code descriptionList}.</li>
     * </ul>
     *
     * @throws IOException if the file cannot be read, an attribute value has no closing
     *                     quote on its line, or the file ends inside a CDATA description
     * @throws Exception   declared for compatibility with existing callers
     */
    public static void processRSSFile() throws Exception
    {
        BufferedReader reader = new BufferedReader(new FileReader(rssFile));
        try
        {
            String line;

            // Skip the channel header; only what follows </channel> is parsed.
            while ((line = reader.readLine()) != null)
            {
                if (line.indexOf("</channel>") >= 0)
                {
                    break;
                }
            }

            while ((line = reader.readLine()) != null)
            {
                String itemLink = attributeValue(line, ITEM_LINK_MARKER);
                if (itemLink != null)
                {
                    linkList.add(itemLink);
                }

                // Anchor on the full marker so an unrelated id="..." earlier on the
                // same line cannot be mistaken for the molecule id.
                String moleculeId = attributeValue(line, MOLECULE_ID_MARKER);
                if (moleculeId != null)
                {
                    idList.add(moleculeId);
                }

                if (line.indexOf("<description>") >= 0)
                {
                    line = reader.readLine();
                    if (line == null)
                    {
                        // File ends right after <description>: nothing left to parse.
                        break;
                    }
                    if (line.indexOf("<![CDATA[") >= 0)
                    {
                        descriptionList.add(readCdataBody(reader));
                    }
                }
            }
        }
        finally
        {
            // Release the file handle even when parsing fails.
            reader.close();
        }
    }

    /**
     * Returns the text between {@code marker} and the next double quote on {@code line}.
     *
     * @param line   the line to search
     * @param marker text ending in the opening quote of the wanted value
     * @return the quoted value, or {@code null} when {@code marker} does not occur
     * @throws IOException if the value has no closing quote on the same line
     */
    private static String attributeValue(String line, String marker) throws IOException
    {
        int markerAt = line.indexOf(marker);
        if (markerAt < 0)
        {
            return null;
        }
        int valueStart = markerAt + marker.length();
        int valueEnd = line.indexOf("\"", valueStart);
        if (valueEnd < 0)
        {
            throw new IOException("No closing quote after '" + marker + "' in line: " + line);
        }
        return line.substring(valueStart, valueEnd);
    }

    /**
     * Reads lines from {@code reader} until one contains {@code ]]>} and returns the
     * preceding lines, each trimmed and followed by {@link #newLine}. The terminating
     * line itself is discarded.
     *
     * @throws IOException if the stream ends before a line containing {@code ]]>}
     */
    private static String readCdataBody(BufferedReader reader) throws IOException
    {
        StringBuffer body = new StringBuffer();
        String line = reader.readLine();
        while (line != null && line.indexOf("]]>") < 0)
        {
            body.append(line.trim());
            body.append(newLine);
            line = reader.readLine();
        }
        if (line == null)
        {
            throw new IOException("Unexpected end of " + rssFile
                    + " inside a CDATA description (missing \"]]>\")");
        }
        return body.toString();
    }

    /** Writes {@code text} followed by {@link #newLine}. */
    private static void writeLine(Writer out, String text) throws IOException
    {
        out.write(text);
        out.write(newLine);
    }

    /** Writes one single-cell table row holding a hyperlink. */
    private static void writeLinkRow(Writer out, String href, String label) throws IOException
    {
        writeLine(out, "<tr><td nowrap><a href=\"" + href + "\">" + label + "</a></td></tr>");
    }

    /**
     * Writes {@code UsefulChemistryMolecules.html} into {@link #saveDirectory}: a page
     * with a drop-down of every id in {@link #idList} whose selection loads
     * {@code <id>.html} into an iframe and shows a link to open it in a full window.
     *
     * @throws IOException if the file cannot be created or written
     * @throws Exception   declared for compatibility with existing callers
     */
    public static void saveHTMLIndexFile() throws Exception
    {
        Writer out = new BufferedWriter(
                new FileWriter(saveDirectory + separator + "UsefulChemistryMolecules.html"));
        try
        {
            writeLine(out, "<html>");
            writeLine(out, "<head>");
            writeLine(out, "<title>Useful Chemistry Molecules</title>");
            out.write(newLine);
            writeLine(out, "<script type=\"text/javascript\">");
            writeLine(out, "var UCNumber;");
            writeLine(out, "var UCNumberHTMLFile;");
            writeLine(out, "function getHTMLFile()");
            writeLine(out, "{");
            // The select box is the first element of the first form on the page.
            writeLine(out, " UCNumber = document.forms[0].elements[0].value;");
            writeLine(out, " UCNumberHTMLFile = UCNumber + \".html\";");
            writeLine(out, " document.getElementById(\"link\").innerHTML = \"<a href=\\\"\" + UCNumberHTMLFile + \"\\\">View \" + UCNumberHTMLFile + \" in full window.</a>\";");
            writeLine(out, " document.getElementById(\"description\").src = UCNumberHTMLFile;");
            writeLine(out, "}");
            out.write(newLine);
            writeLine(out, "</script>");
            writeLine(out, "</head>");
            writeLine(out, "<body onLoad=\"getHTMLFile()\">");
            writeLine(out, "<h2>Useful Chemistry Molecules</h2>");
            writeLine(out, "<hr/>");
            writeLine(out, "<table cellspacing=\"2\" cellpadding=\"1\">");
            // See UNKNOWN_LINK_TARGET: the real URLs for these rows still need restoring.
            writeLinkRow(out, UNKNOWN_LINK_TARGET, "Spreadsheet of Useful Chemistry Molecules");
            writeLinkRow(out, UNKNOWN_LINK_TARGET, "File of Useful Chemistry Molecules");
            writeLinkRow(out, UNKNOWN_LINK_TARGET, "Chemistry Molecules SD File");
            writeLinkRow(out, UNKNOWN_LINK_TARGET, "Chemistry Molecules New Items Page");
            writeLinkRow(out, UNKNOWN_LINK_TARGET, "Chemistry Molecules Blog");
            writeLine(out, "</table>");
            writeLine(out, "<hr/>");
            writeLine(out, "<form name=\"UCNumberSelectionForm\">");
            writeLine(out, "<select name=\"UCNumber\" onChange=\"getHTMLFile()\">");
            for (int i = 0; i < idList.size(); i++)
            {
                String moleculeId = (String) idList.get(i);
                writeLine(out, "<option value=\"" + moleculeId + "\">" + moleculeId + "</option>");
            }
            writeLine(out, "</select>&nbsp;Select the UC # to view the item.");
            writeLine(out, "</form>");
            writeLine(out, "<div id=\"link\"></div>");
            writeLine(out, "<hr/>");
            writeLine(out, "<iframe id=\"description\" frameborder=\"0\" height=\"100%\" width=\"100%\"></iframe>");
            writeLine(out, "</body>");
            writeLine(out, "</html>");
            out.flush();
        }
        finally
        {
            out.close();
        }
    }

    /**
     * Writes one {@code <id>.html} page into {@link #saveDirectory} for every entry of
     * {@link #idList}. Page {@code i} links to {@code linkList[i]}, to {@code <id>.xls}
     * and {@code <id>.txt}, and embeds {@code descriptionList[i]} verbatim.
     *
     * @throws IllegalStateException if there are fewer links or descriptions than ids;
     *                               checked before any page is written
     * @throws IOException           if a page cannot be created or written
     * @throws Exception             declared for compatibility with existing callers
     */
    public static void saveHTMLFiles() throws Exception
    {
        int count = idList.size();
        if (linkList.size() < count || descriptionList.size() < count)
        {
            // The three lists are paired by index; fail before producing partial output.
            throw new IllegalStateException("Found " + count + " molecule ids but only "
                    + linkList.size() + " links and " + descriptionList.size()
                    + " descriptions; every id needs a matching link and description");
        }

        for (int i = 0; i < count; i++)
        {
            String moleculeId = (String) idList.get(i);
            String itemLink = (String) linkList.get(i);
            String htmlFile = saveDirectory + separator + moleculeId + ".html";
            String xlsFile = moleculeId + ".xls";
            String textFile = moleculeId + ".txt";

            Writer out = new BufferedWriter(new FileWriter(htmlFile));
            try
            {
                writeLine(out, "<html>");
                writeLine(out, "<head>");
                writeLine(out, "<title>" + moleculeId + " From Useful Chemistry Molecules</title>");
                writeLine(out, "</head>");
                out.write(newLine);
                writeLine(out, "<body>");
                writeLine(out, "<table cellspacing=\"2\" cellpadding=\"1\">");
                writeLinkRow(out, itemLink, "View " + moleculeId + " UsefulChem Blog Entry");
                writeLinkRow(out, xlsFile, "Excel Version of this information");
                writeLinkRow(out, textFile, "Text Version of this information");
                writeLine(out, "</table>");
                writeLine(out, "<br/>Click on the image to view this molecule in Jmol:<br/>");
                // The description is markup taken from the feed and is embedded as-is.
                writeLine(out, (String) descriptionList.get(i));
                writeLine(out, "</body>");
                writeLine(out, "</html>");
                out.flush();
            }
            finally
            {
                out.close();
            }
        }
    }

    /**
     * Entry point: {@code java ExtractHTMLPages <save directory> <RSS File>}.
     * Creates the save directory if needed, parses the feed and writes the index and
     * per-molecule pages, printing {@code [ok]} or {@code [failed] <reason>}.
     *
     * @param args the save directory followed by the RSS file path
     */
    public static void main(String[] args)
    {
        if (args.length < 2)
        {
            System.out.println("Usage: java ExtractHTMLPages <save directory> <RSS File>");
            System.exit(0);
        }

        try
        {
            saveDirectory = args[0];
            rssFile = args[1];

            System.out.print("Extracting HTML pages from " + rssFile + " ... ");

            idList = new ArrayList();
            linkList = new ArrayList();
            descriptionList = new ArrayList();

            File directory = new File(saveDirectory);
            // mkdirs also creates missing parent directories; report failure up front
            // instead of letting the first file write fail with a less obvious error.
            if (!directory.isDirectory() && !directory.mkdirs())
            {
                throw new IOException("Cannot create save directory " + saveDirectory);
            }

            processRSSFile();
            saveHTMLIndexFile();
            saveHTMLFiles();

            System.out.println("[ok]");
        }
        catch (Exception e)
        {
            // Some exceptions carry no message; fall back to the exception's own text.
            String reason = e.getMessage();
            System.out.println("[failed] " + (reason != null ? reason : e.toString()));
        }
    }
}