// Nadun's Weblog: HTML Parser - Java Library to extract html Elements

HTML Parser is a very handy Java library that can be used to extract values from an HTML page.

Let's take the URL - http://www.google.ca/finance?q=BMO

Let's say we want to extract the value of the span tag with the CSS class "nwp" (for example, "Mar 18 - Close").

Here is the code:

import java.io.IOException;

import java.net.MalformedURLException;

import java.net.URL;

import java.net.URLConnection;

import org.htmlparser.Node;

import org.htmlparser.Parser;

import org.htmlparser.filters.CssSelectorNodeFilter;

import org.htmlparser.util.NodeList;

import org.htmlparser.util.ParserException;

public class test {

/**

* @param args

public static void main(String[] args) {

String URL = "http://www.google.ca/finance?q=BMO";

try {

URL url = new URL(URL);

URLConnection urlcon;

urlcon = url.openConnection();

Parser parser = new Parser(urlcon); // creating of HTML parser

// object

CssSelectorNodeFilter cssFilter = new CssSelectorNodeFilter(

"SPAN.nwp");

NodeList nodes = parser.parse(cssFilter); // getting nodes of

// span.nwp. Only NodeList created with one value

Node node = nodes.elementAt(0); //getting the first value of the nodes list

System.out.println(nodes.size());

String value = node.getFirstChild().getText(); //getting the value of the node

System.out.println(value);

} catch (MalformedURLException e) {

System.out.println("Malformed Exception :" + e.getMessage());

} catch (IOException e) {

e.printStackTrace();

} catch (ParserException e) {

e.printStackTrace();

}

// Nadun's Weblog