asp.net 火车票源信息抓取系统
一、系统功能:
1.每隔一定时间从网络抓取一次最新的票源信息;
2.支持根据关键字筛选票源信息;
3.支持抓取时间间隔设置;
4.支持票源网址链接;
二、运行环境:
1. .NET 2.0 框架及以上;
2.ie6.0及以上;
三、实现思路:
1.设置抓取的地址与解析的方式
/// <summary>
/// Builds the built-in list of ticket sites to scrape. Each entry carries the
/// page URL, the regular expression used to extract listings, the page
/// encoding and, where set, a link prefix (domain) and filter keywords.
/// </summary>
/// <returns>A newly allocated list holding the four default site definitions.</returns>
/// <remarks>
/// NOTE(review): the regexpattern literals below contain at most one capture
/// group, while donetinfoparse reads groups 1 and 2. The patterns look like
/// they lost their HTML markup when this listing was published; restore them
/// from the original project before relying on the results.
/// </remarks>
public static List<site> getdefaultsites()
{
    // Allocated once; the original listing created the list twice in a row.
    List<site> sites = new List<site>();

    sites.Add(new site()
    {
        name = "火车票网",
        url = "http://www.huochepiao.com/city/search.asp?leixing=%d7%aa%c8%c3&chufa=&daoda=",
        regexpattern = @"· (.*?)",
        // System default (ANSI) code page; no domain prefix is configured for this site.
        encoding = Encoding.Default,
        keys = new string[] { "卧" }
    });

    sites.Add(new site()
    {
        name = "百姓网",
        url = "http://beijing.baixing.com/huochepiao/?%e5%8f%91%e8%bd%a6%e6%97%a5%e6%9c%9f=&%e8%bd%a6%e6%ac%a1=&%e5%87%ba%e5%8f%91%e5%9f%8e%e5%b8%82=%e5%8c%97%e4%ba%ac&%e5%88%b0%e8%be%be%e5%9f%8e%e5%b8%82=&wanted=1",
        regexpattern = @""" >(.*?)",
        encoding = Encoding.UTF8,
        domain = "http://beijing.baixing.com/",
        keys = new string[] { "卧" }
    });

    sites.Add(new site()
    {
        name = "赶集网",
        url = "http://bj.ganji.com/piao/",
        regexpattern = @"(.*?) ",
        encoding = Encoding.UTF8,
        domain = "http://bj.ganji.com/",
        keys = new string[] { "卧" }
    });

    sites.Add(new site()
    {
        name = "酷讯网",
        url = "http://huoche.kuxun.cn/zhuanrang-beijing-wuhan.html",
        regexpattern = @"(.*?)",
        encoding = Encoding.UTF8,
        domain = "",
        // A non-empty ischange makes donetinfoparse swap which capture group
        // is treated as the content and which as the URL. No keys are set here.
        ischange = "yes"
    });

    return sites;
}
抓取网页信息
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes the raw bytes
/// with the supplied encoding.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="codetpye">Encoding used to turn the downloaded bytes into text.</param>
/// <returns>
/// The decoded page text, or an empty string when the download or the
/// decoding fails for any reason (this method never throws).
/// </returns>
public string getnetstring(string url, Encoding codetpye)
{
    string str = "";
    try
    {
        // WebClient is disposable; release it as soon as the download is done.
        using (WebClient client = new WebClient())
        {
            byte[] pagedata = client.DownloadData(url);
            str = codetpye.GetString(pagedata);
        }
    }
    catch (System.Exception ex)
    {
        // Best effort by design: callers get "" on failure. Leave a trace so a
        // dead site or bad URL can be diagnosed instead of failing silently.
        System.Diagnostics.Trace.WriteLine("getnetstring failed for " + url + ": " + ex.Message);
    }
    return str;
}
解析票源信息
/// <summary>
/// Extracts ticket listings from downloaded page text and remembers which
/// listings (by URL) have already been returned, so repeated scans only
/// report new ones.
/// </summary>
public class clsnetinfoparseserver
{
    // Every result returned so far. Static, so it is shared by all instances
    // of this class; it is only reset through clearls().
    private static IList<getresult> lslist = new List<getresult>();

    /// <summary>Forgets all previously returned results.</summary>
    public void clearls()
    {
        lslist = new List<getresult>();
    }

    /// <summary>Tells whether a result with this URL was already recorded.</summary>
    /// <param name="url">URL to look up (compared with ==, i.e. exact match).</param>
    private bool ishas(string url)
    {
        foreach (var item in lslist)
        {
            if (item.url == url)
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>Tells whether the content passes the keyword filter.</summary>
    /// <returns>
    /// true when <paramref name="keys"/> is null or empty (no filtering), or
    /// when <paramref name="content"/> contains at least one of the keys.
    /// </returns>
    private static bool containsanykey(string content, string[] keys)
    {
        if (keys == null || keys.Length == 0)
        {
            return true;
        }
        foreach (string key in keys)
        {
            if (content.Contains(key))
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Applies the site's regular expression to the page text and returns the
    /// matches that are new (URL not seen before) and pass the keyword filter.
    /// Every returned result is also recorded in the shared seen-list.
    /// </summary>
    /// <param name="strnetinfo">Page text to scan; null or empty yields an empty list.</param>
    /// <param name="site">Site definition supplying regexpattern, domain, ischange and name.</param>
    /// <param name="keys">Keywords to filter on; null or empty disables filtering.</param>
    /// <returns>The newly found results, in match order.</returns>
    public IList<getresult> donetinfoparse(string strnetinfo, site site, string[] keys)
    {
        IList<getresult> list = new List<getresult>();

        // A failed download arrives as "" and Regex.Matches rejects null, so
        // there is nothing to scan in either case.
        if (string.IsNullOrEmpty(strnetinfo))
        {
            return list;
        }

        // Every Match in the collection returned by Regex.Matches is a
        // successful match, so no per-item Success check is needed.
        MatchCollection mc = Regex.Matches(strnetinfo, site.regexpattern);
        foreach (Match m in mc)
        {
            getresult r = new getresult();
            if (!string.IsNullOrEmpty(site.ischange))
            {
                // Swapped layout: group 1 is the content, group 2 is the URL.
                // NOTE(review): the domain is prepended to the content here,
                // not to the URL. That looks unintended, but it is kept as in
                // the original; confirm before changing.
                r.content = site.domain + m.Groups[1].Value.Trim();
                r.url = m.Groups[2].Value.Trim();
            }
            else
            {
                // Default layout: group 1 is the (relative) URL, group 2 is the content.
                r.url = site.domain + m.Groups[1].Value.Trim();
                r.content = m.Groups[2].Value.Trim();
            }

            // Skip listings already reported and listings without any keyword.
            if (ishas(r.url) || !containsanykey(r.content, keys))
            {
                continue;
            }

            r.getdatetime = DateTime.Now.ToString();
            r.name = site.name;
            lslist.Add(r);
            list.Add(r);
        }
        return list;
    }
}
相关文章
精彩推荐