[轉貼] 利用正規表達式(Regular Expression)取得網頁內容的超連結Url

2012/04/03 16:18

出處:http://www.dotblogs.com.tw/puma/archive/2008/06/30/4410.aspx


 

最近有人討論到這個問題...小弟去找了一個Regular Expression,還不錯用..

雖然不一定能完全正確地取出Url,但我覺得命中率已經很高了...

就用範例來介紹,分享給大家呀..

asp.net(c#)

Default.aspx

<%@ Page Language="C#" AutoEventWireup="true" ValidateRequest="false" CodeBehind="Default.aspx.cs" Inherits="Test._Default" %>

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml" >
<head id="Head1" runat="server">
    <title>利用正規表達式(Regular Expression)取得網頁內容的超連結Url</title>
</head>

<body>
    <%-- Single server-side form: a URL input, a button, and a grid for the results. --%>
    <form id="form1" runat="server">
    <div>
        URL:
        <%-- Address of the page to scan; pre-filled with a sample URL. --%>
        <asp:TextBox ID="TextBox1" runat="server" Width="340px">http://tw.yahoo.com</asp:TextBox>
        <%-- Clicking posts back and runs Button1_Click in the code-behind. --%>
        <asp:Button ID="Button1" runat="server" OnClick="Button1_Click" Text="GetHtmlUrl" /><br />
        <%-- No columns are declared here; the code-behind assigns the data source and binds it. --%>
        <asp:GridView ID="GridView1" runat="server"></asp:GridView>
    </div>
    </form>
</body>
</html>

Default.aspx.cs

using System;
using System.Data;
using System.Configuration;
using System.Collections;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Net;
using System.Text.RegularExpressions;
using System.Collections.Generic;
using System.Text;

namespace Test
{
    public partial class _Default : System.Web.UI.Page
    {
        protected void Page_Load(object sender, EventArgs e)
        {
        }

        protected void Button1_Click(object sender, EventArgs e)
        {
            WebClient client = new WebClient();
            client.Encoding = Encoding.UTF8;
            string htmlCode = client.DownloadString(this.TextBox1.Text);

            //(?<HTML><a[^>]*href\s*=\s*[\"\']?(?<HRef>[^"'>\s]*)[\"\']?[^>]*>(?<Title>[^<]+|.*?)?</a\s*>)
            string regPattern = @"(?<HTML><a[^>]*href\s*=\s*[\""\']?(?<HRef>[^""'>\s]*)[\""\']?[^>]*>(?<Title>[^<]+|.*?)?</a\s*>)";
            MatchCollection mc = Regex.Matches(htmlCode, regPattern);

            List<Href> list = new List<Href>();

            foreach (Match match in mc)
            {
                list.Add(new Href(match.Groups[1].Value, match.Groups[2].Value, match.Groups[3].Value));
            }

            this.GridView1.DataSource = list;
            this.GridView1.DataBind();
        }
    }

    public class Href
    {
        private string _Tag;
        private string _Url;
        private string _Title;

        public Href(string Tag, string Url, string Title)
        {
            _Tag = Tag;
            _Url = Url;
            _Title = Title;
        }

        public string Tag
        {
            set { _Tag = value; }
            get { return _Tag; }
        }

        public string Url
        {
            set { _Url = value; }
            get { return _Url; }
        }

        public string Title
        {
            set { _Title = value; }
            get { return _Title; }
        }
    }
}

執行結果:

 

參考網址:

http://regexlib.com/RETester.aspx?regexp_id=984
http://forums.msdn.microsoft.com/zh-TW/regexp/thread/c85fc09a-0760-4a4b-b980-168c2acda953/