12/16/2012

Screen-Scraping Stock Market Data with HtmlAgilityPack

I’ve been involved with the stock market since the early 80’s when I was a rookie broker with Merrill Lynch in Orlando. In fact probably the main reason I got out of stock trading and into programming was because I spent more time looking at the technical indicators than selling stocks and mutual funds!

Back in the 1970’s legendary market guru Norman Fosback developed a “Daily Market Forecast” index that had an uncanny accuracy with predicting the direction of the next day’s market. Most of the items I have in this article were included in that index.

There is a page on Yahoo Finance, http://finance.yahoo.com/advances  which provides most of these indicators daily. Another page on the WSJ, http://online.wsj.com/mdc/public/page/2_3021-tradingdiary2.html  provides the daily TICK and TRIN (Arms index) that are also components of Fosback’s indicator.

I put together a console application that can be run daily via Task Scheduler to scrape these values daily and store them in a SQL Server database table. I use HtmlAgilityPack, which basically converts an HTML document into an XPath-compliant XML document to get the values, and Dapper to perform the SQL insert / updates.

Here’s how it works:

using System; 
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using HtmlAgilityPack;
using System.Net;
using Dapper;
using System.Configuration ;

namespace DJMarketData
{
class Program
{
static void Main(string[] args)
{
string tickPage = "http://online.wsj.com/mdc/public/page/2_3021-tradingdiary2.html";
string targetUrl = "http://finance.yahoo.com/advances";

// create a new HtmlDocument object
HtmlAgilityPack.HtmlDocument doc = new HtmlDocument();
// load the html page via WebClient
WebClient wc = new WebClient();
byte[] b = wc.DownloadData(targetUrl);
MemoryStream ms = new MemoryStream(b);
wc.Dispose();
doc.Load(ms);
//Get the HtmlNodeCollection of TD elements that have our values
var stuff = doc.DocumentNode.SelectNodes("//td[@align='right']");

string val = "";
string name = "";
Dictionary<String, double> items = new Dictionary<string, double>();

int ctr = 0;
// iterate over our nodelist and get each value; assign the correct fieldname based on the
// FieldNames enum
foreach (var nod in stuff)
{
var prevNode = nod.PreviousSibling;
if(prevNode!=null)
name = prevNode.InnerText;
val = nod.ChildNodes[0].InnerText;
var fieldName = Enum.GetName(typeof(FieldNames), ctr);
if (val.Length > 13)
val = val.Substring(0, 13);
else
val = val.TrimEnd();

items.Add( fieldName, double.Parse(val));

Console.WriteLine(fieldName + " : " + val);
// if we hit the last item we can stop processing
if (fieldName == "BBTOTV") break;
ctr++;
}

// Now get the Tick and Trin from the WSJ page
HtmlAgilityPack.HtmlDocument doc2 = new HtmlDocument();
WebClient wc2 = new WebClient();
byte[] b2 = wc.DownloadData(tickPage );
MemoryStream ms2 = new MemoryStream(b2);
doc.Load(ms2);

var ticks = doc.DocumentNode.SelectNodes("//td[@class='text']").Where(x=>x.InnerText=="Closing tick").ToList();
var arms = doc.DocumentNode.SelectNodes("//td[@class='text']").Where(x => x.InnerText.Contains("Closing Arms")).ToList();

var NYSETICK = int.Parse(ticks[0].NextSibling.NextSibling.InnerText);
var NYSETRIN = double.Parse(arms[0].NextSibling.NextSibling.InnerText);
// Create a TradeData object to hold all our values
TradeData data = new TradeData()
{
TRADEDATE = DateTime.Today,
NYSEADV = (int) items["NYSEADV"],
AMEXADV = (int)items["AMEXADV"],
NASDADV = (int)items["NASDADV"],
BBADV = (int)items["BBADV"],
NYSEDEC = (int)items["NYSEDEC"],
AMEXDEC = (int)items["AMEXDEC"],
NASDDEC = (int)items["NASDDEC"],
BBDEC = (int)items["BBDEC"],
NYSETOT = (int)items["NYSETOT"],
AMEXTOT = (int)items["AMEXTOT"],
NASDTOT = (int)items["NASDTOT"],
BBTOT = (int)items["BBTOT"],
NYSENH = (int)items["NYSENH"],
AMEXNH = (int)items["AMEXNH"],
NASDNH = (int)items["NASDNH"],
BBNH = (int)items["BBNH"],
NYSENL = (int)items["NYSENL"],
AMEXNL = (int)items["AMEXNL"],
NASDNL = (int)items["NASDNL"],
BBNL = (int)items["BBNL"],
NYSEUV = (int)items["NYSEUV"],
AMEXUV = (int)items["AMEXUV"],
NASDUV = (int)items["NASDUV"],
BBUV = (int)items["BBUV"],
NYSEDV = (int)items["NYSEDV"],
AMEXDV = (int)items["AMEXDV"],
NASDDV = (int)items["NASDDV"],
BBDV = (int)items["BBDV"],
NYSEUNV = (int)items["NYSEUNV"],
AMEXUNV = (int)items["AMEXUNV"],
NASDUNV = (int)items["NASDUNV"],
BBUNV = (int)items["BBUNV"],
NYSETOTV = (int)items["NYSETOTV"],
AMEXTOTV = (int)items["AMEXTOTV"],
NASDTOTV = (int)items["NASDTOTV"],
BBTOTV = (int)items["BBTOTV"],
NYSETICK = NYSETICK,
NYSETRIN = NYSETRIN

};

// convert the TradeData instance to a set of DynamicParameters
DynamicParameters parms = SqlMapperUtil.GetParametersFromObject(data, null);
// Perform the SQL insert / update
SqlMapperUtil.InsertUpdateOrDeleteStoredProc("InsertMarketData", parms, "local");
// DONE!

}

public enum FieldNames
{
NYSEADV,
AMEXADV,
NASDADV,
BBADV,
NYSEDEC,
AMEXDEC,
NASDDEC,
BBDEC,
NYSETOT,
AMEXTOT,
NASDTOT,
BBTOT,
NYSENH,
AMEXNH,
NASDNH,
BBNH,
NYSENL,
AMEXNL,
NASDNL,
BBNL,
NYSEUV,
AMEXUV,
NASDUV,
BBUV,
NYSEDV,
AMEXDV,
NASDDV,
BBDV,
NYSEUNV,
AMEXUNV,
NASDUNV,
BBUNV,
NYSETOTV,
AMEXTOTV,
NASDTOTV,
BBTOTV
}

}
}


There is also a class to hold the TradeData for each day’s scraped values:



using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace DJMarketData
{
public class TradeData
{
public DateTime TRADEDATE {get;set;}
public int NYSEADV {get;set;}
public int AMEXADV {get;set;}
public int NASDADV {get;set;}
public int BBADV {get;set;}
public int NYSEDEC {get;set;}
public int AMEXDEC {get;set;}
public int NASDDEC {get;set;}
public int BBDEC {get;set;}
public int NYSETOT {get;set;}
public int AMEXTOT {get;set;}
public int NASDTOT {get;set;}
public int BBTOT {get;set;}
public int NYSENH {get;set;}
public int AMEXNH {get;set;}
public int NASDNH {get;set;}
public int BBNH {get;set;}
public int NYSENL {get;set;}
public int AMEXNL {get;set;}
public int NASDNL {get;set;}
public int BBNL {get;set;}
public int NYSEUV {get;set;}
public int AMEXUV {get;set;}
public int NASDUV {get;set;}
public int BBUV {get;set;}
public int NYSEDV {get;set;}
public int AMEXDV {get;set;}
public int NASDDV {get;set;}
public int BBDV {get;set;}
public int NYSEUNV {get;set;}
public int AMEXUNV {get;set;}
public int NASDUNV {get;set;}
public int BBUNV {get;set;}
public int NYSETOTV {get;set;}
public int AMEXTOTV {get;set;}
public int NASDTOTV {get;set;}
public int BBTOTV {get;set;}
public int NYSETICK {get;set;}
public double NYSETRIN {get;set;}
}
}


The downloadable Visual Studio 2012 solution contains a SQL Script to create the SQL Server table and stored procedure that match this code.



Download Source Code