C#.Net基于正则表达式抓取百度百家文章列表的方法示例

作者:袖梨 2022-06-25

工作之余,学习了一下正则表达式,鉴于实践是检验真理的唯一标准,于是便写了一个利用正则表达式抓取百度百家文章的例子,具体过程请看下面源码:

一、获取百度百家网页内容

/// <summary>
/// Downloads the Baidu Baijia front page and hands the HTML to
/// <c>AnalysisHtml</c> to extract the article list.
/// </summary>
/// <returns>
/// The list produced by <c>AnalysisHtml</c> for the downloaded page: one
/// three-element string array per matched article.
/// </returns>
/// <remarks>
/// Exceptions raised while requesting or reading the page propagate to the
/// caller unchanged. The original wrapped the body in a catch block that only
/// did <c>throw ex;</c>, which resets the stack trace without adding any
/// handling, so that wrapper has been removed.
/// </remarks>
public List<string[]> GetUrl()
{
    string url = "http://baijia.baidu.com/";
    WebRequest webRequest = WebRequest.Create(url);

    // The using statements release the response and the reader even when
    // ReadToEnd throws; the manual Close() calls were skipped in that case.
    using (WebResponse webResponse = webRequest.GetResponse())
    using (StreamReader reader = new StreamReader(webResponse.GetResponseStream()))
    {
        string result = reader.ReadToEnd();
        return AnalysisHtml(result);
    }
}

二、通过正则表达式筛选

/// <summary>
/// Extracts article entries from the HTML of the Baidu Baijia list page
/// using a regular expression.
/// </summary>
/// <param name="htmlContent">Raw HTML of the page to scan.</param>
/// <returns>
/// One array per match: index 0 is the article title, index 1 the abstract
/// text, index 2 the URL taken from the "feeds-item-more" link. The list is
/// empty when nothing matches or when <paramref name="htmlContent"/> is null
/// or empty.
/// </returns>
public List<string[]> AnalysisHtml(string htmlContent)
{
    List<string[]> list = new List<string[]>();
    if (string.IsNullOrEmpty(htmlContent))
    {
        return list;
    }

    // NOTE(review): everything in the published pattern before the Title
    // group was lost when the article was extracted. The "<h3>\s*<a[^>]*>"
    // prefix below is a reconstruction inferred from the "</a></h3>" that
    // follows the title; confirm it against the real page markup.
    // The Url group uses [^"]* rather than a greedy .* so that it stops at the
    // closing quote of the href even when several items share one line.
    string strPattern =
        "<h3>\\s*<a[^>]*>(?<Title>[^<]+)</a></h3>.*\\s*" +
        "<p\\s*class=\"feeds-item-text\">(?<Abstract>[^<]+)" +
        "<a\\s*href=\"(?<Url>[^\"]*)\"\\s*target=\"_blank\"\\s*class=\"feeds-item-more\"\\s*mon=\".*\\s*\">.*\\s*</a></p>";
    Regex regex = new Regex(strPattern, RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.CultureInvariant);

    // Matches() yields an empty collection when nothing matches, so a separate
    // IsMatch() pre-check would only scan the input a second time.
    foreach (Match match in regex.Matches(htmlContent))
    {
        string[] str = new string[3];
        str[0] = match.Groups["Title"].Value;    // title of the list entry
        str[1] = match.Groups["Abstract"].Value; // abstract / teaser text
        str[2] = match.Groups["Url"].Value;      // address the entry links to
        list.Add(str);
    }

    return list;
}
class="time">2025-01-15</span> </div> </div> <div class="s-card-pic"> <img src="/images/lazy.gif" data-src="/uploads/20250115/logo_67870aa8355051.jpg" alt="《鬼谷八荒》修为一直是0解决方法" /> </div> </a> </li> <li> <a href="/art-424841.htm" class="s-card"> <div class="s-card-l"> <p class="tit">《宝可梦大集结》密勒顿技能介绍</p> <div class="info"> <span class="person">游戏攻略</span> <span class="time">2025-01-15</span> </div> </div> <div class="s-card-pic"> <img src="/images/lazy.gif" data-src="/uploads/20250115/logo_67870aa6cb0031.jpg" alt="《宝可梦大集结》密勒顿技能介绍" /> </div> </a> </li> <li> <a href="/art-424840.htm" class="s-card"> <div class="s-card-l"> <p class="tit">以下哪种鲸喷出的水柱是双股的</p> <div class="info"> <span class="person">游戏攻略</span> <span class="time">2025-01-15</span> </div> </div> <div class="s-card-pic"> <img src="/images/lazy.gif" data-src="/uploads/20250115/logo_67870aa460b301.png" alt="以下哪种鲸喷出的水柱是双股的" /> </div> </a> </li> </ul> </section> <section class="wrap-box"> <div class="g-tit"> <h2>精彩推荐</h2> </div> <ul class="card-box"> <li class="card3"> <a href="/app/103536.htm" target="_self" class="figure"> <div class="figure-box"> <img src="/images/lazy.gif" data-src="https://img.111cn.net/uploads/20250115/logo_67870c76e47e21.png" alt="真赵云无双九游 安卓最新版v1.6.6" /> </div> <p class="figure-head">真赵云无双九游 安卓最新版v1.6.6</p> <span class="figure-btn">下载</span> </a> </li> <li class="card3"> <a href="/app/103503.htm" target="_self" class="figure"> <div class="figure-box"> <img src="/images/lazy.gif" data-src="https://img.111cn.net/uploads/20250115/logo_67870c0b974041.png" alt="战魂铭人oppo版 安卓版v2.7.0" /> </div> <p class="figure-head">战魂铭人oppo版 安卓版v2.7.0</p> <span class="figure-btn">下载</span> </a> </li> <li class="card3"> <a href="/app/103475.htm" target="_self" class="figure"> <div class="figure-box"> <img src="/images/lazy.gif" data-src="https://img.111cn.net/uploads/20250115/logo_67870ba5c7c7d1.png" alt="战魂铭人九游版 最新版v2.7.0" /> </div> <p class="figure-head">战魂铭人九游版 最新版v2.7.0</p> <span class="figure-btn">下载</span> </a> 
</li> <li class="card3"> <a href="/app/103452.htm" target="_self" class="figure"> <div class="figure-box"> <img src="/images/lazy.gif" data-src="https://img.111cn.net/uploads/20250114/logo_6785c315c66831.png" alt="星球重启云游戏官方正版 安卓版v1.2.42" /> </div> <p class="figure-head">星球重启云游戏官方正版 安卓版v1.2.42</p> <span class="figure-btn">下载</span> </a> </li> </ul> <ul class="card-box-b"> <li class="card10"> <a href="/app/103595.htm" target="_self" class="figure2"> <div class="figure-box"> <img src="/images/lazy.gif" data-src="https://img.111cn.net/uploads/20250115/logo_67870d13d3f971.png" alt="梦想家园汉化版 最新版v1.3.0" /> </div> <div class="figure-cont"> <p class="figure-head">梦想家园汉化版 最新版v1.3.0</p> <div class="figure-desc"> <span>模拟经营</span> <span>梦想家园汉化版 最新版v1.3.0</span> </div> <div class="figure-desc"> <p>梦想家园汉化安卓版是一款以泡泡玛特为主题,玩法独特的模拟经营</p> </div> </div> <span class="figure-btn">下载</span> </a> </li> <li class="card10"> <a href="/app/103593.htm" target="_self" class="figure2"> <div class="figure-box"> <img src="/images/lazy.gif" data-src="https://img.111cn.net/uploads/20250115/logo_67870d0c30af11.png" alt="服从我 (Obey Me!)安卓版v8.1.11" /> </div> <div class="figure-cont"> <p class="figure-head">服从我 (Obey Me!)安卓版v8.1.11</p> <div class="figure-desc"> <span>模拟经营</span> <span>服从我 (Obey Me!)安卓版v8.1.11</span> </div> <div class="figure-desc"> <p>服从我(obey me)是一款让你陷入ikemen恶魔们深情</p> </div> </div> <span class="figure-btn">下载</span> </a> </li> <li class="card10"> <a href="/app/103589.htm" target="_self" class="figure2"> <div class="figure-box"> <img src="/images/lazy.gif" data-src="https://img.111cn.net/uploads/20250115/logo_67870d03d3d3f1.png" alt="佩皮超级商店 免费版v1.13.1" /> </div> <div class="figure-cont"> <p class="figure-head">佩皮超级商店 免费版v1.13.1</p> <div class="figure-desc"> <span>模拟经营</span> <span>佩皮超级商店 免费版v1.13.1</span> </div> <div class="figure-desc"> <p>佩皮超级商店(Pepi Super Stores)是一款经营</p> </div> </div> <span class="figure-btn">下载</span> </a> </li> <li class="card10"> <a href="/app/103585.htm" 
target="_self" class="figure2"> <div class="figure-box"> <img src="/images/lazy.gif" data-src="https://img.111cn.net/uploads/20250115/logo_67870cfd3d3691.png" alt="船舶墓地模拟器内置菜单最新版本 v142" /> </div> <div class="figure-cont"> <p class="figure-head">船舶墓地模拟器内置菜单最新版本 v142</p> <div class="figure-desc"> <span>模拟经营</span> <span>船舶墓地模拟器内置菜单最新版本 v142</span> </div> <div class="figure-desc"> <p>船舶墓地模拟器内置菜单版是一款模拟经营类游戏,玩家们将在这里</p> </div> </div> <span class="figure-btn">下载</span> </a> </li> <li class="card10"> <a href="/app/103584.htm" target="_self" class="figure2"> <div class="figure-box"> <img src="/images/lazy.gif" data-src="https://img.111cn.net/uploads/20250115/logo_67870cfa705231.png" alt="铠甲勇士捕将变身器模拟器 最新版v1.5" /> </div> <div class="figure-cont"> <p class="figure-head">铠甲勇士捕将变身器模拟器 最新版v1.5</p> <div class="figure-desc"> <span>模拟经营</span> <span>铠甲勇士捕将变身器模拟器 最新版v1.5</span> </div> <div class="figure-desc"> <p>铠甲勇士捕将变身器模拟器是一款有着丰富选择的腰带召唤器,该召</p> </div> </div> <span class="figure-btn">下载</span> </a> </li> </ul> </section> <footer class="foot"> <a href="/" class="logo-icon"> <img src="/mobile/images/logo2.png" alt="一聚教程网"> </a> <p>Copyright © 2010-2022</p> <p>111cn.net All Rights Reserved</p> </footer> <script> var advData = {"img_fixed_pc_adv":"https:\/\/img.111cn.net\/uploads\/20240509\/663c2e9729f58.jpg","img_fixed_mob_adv":"https:\/\/img.111cn.net\/uploads\/20240509\/663c2e8793225.jpg","url_adv":"http:\/\/shop.hushen.cn\/shop\/c\/baojianpin.html","str_adv":"\u864e\u795e\u5546\u57ce\uff1a\u5173\u7231\u7537\u6027\uff0c\u66f4\u61c2\u7537\u4eba\u3002\u89e3\u51b3\u5927\u4f17\u7684\u7537\u8a00\u4e4b\u9690","img_popup_adv":"https:\/\/img.111cn.net\/uploads\/20240509\/663c2e748238d.png","pc_show_img":"2","pc_show_popup":"2","pc_show_video":"2","mob_show_img":"2","mob_show_popup":"2","mob_show_video":"2","close_adv":"https:\/\/img.111cn.net\/uploads\/20240508\/663b20650801e.png","video_adv":"\/pc\/images\/pc-adv.mp4"}; </script> <script src="/jspc/funcmob.js" 
type="text/javascript"></script> <!-- Google tag (gtag.js) --> <script async src="https://www.googletagmanager.com/gtag/js?id=G-DSRRGRV1TL"></script> <script> window.dataLayer = window.dataLayer || []; function gtag(){dataLayer.push(arguments);} gtag('js', new Date()); gtag('config', 'G-DSRRGRV1TL'); </script> <div class="back-top" style="display: block;"> <span class="icon-box"> <svg class="icon" viewBox="0 0 1024 1024"> <path d="M213.333333 640h170.666667v256h256v-256h170.666667l-298.666667-341.333333zM170.666667 128h682.666666v85.333333H170.666667z" fill="#0374f3"></path> </svg> </span> </div> </div> <script src="/js/stat.js"></script> </body> </html>