c# 使用正则解析html
标签:val 返回键 lin gpo stat option any oar 之间
1 #region 解析HTML
/// <summary>
/// Extracts every occurrence of an HTML element, including its opening and closing tags.
/// </summary>
/// <param name="as_Html">HTML text to search. Null or empty input yields an empty array.</param>
/// <param name="tags">
/// Element name without angle brackets, e.g. <c>div</c>. The value is inserted into the
/// regular expression verbatim, so any regex metacharacters in it are interpreted.
/// </param>
/// <returns>The matched fragments in document order; an empty array when nothing matches.</returns>
/// <remarks>
/// Matching is case-insensitive and lazy: each match ends at the first closing tag, so nested
/// elements of the same name are not balanced.
/// </remarks>
public static string[] RegexHtmlToFormat(string as_Html, string tags)
{
    if (string.IsNullOrEmpty(as_Html) || string.IsNullOrEmpty(tags))
    {
        return new string[0];
    }

    // \b keeps a tag such as "p" from also matching "<pre ...>";
    // [\s\S] lets the element body span line breaks.
    Regex regex = new Regex(
        "<" + tags + "\\b[^>]*?>[\\s\\S]*?</" + tags + ">",
        RegexOptions.IgnoreCase | RegexOptions.Multiline);

    // Matches() already returns an empty collection when nothing is found,
    // so a separate IsMatch() pre-scan is unnecessary.
    MatchCollection matches = regex.Matches(as_Html);
    string[] fragments = new string[matches.Count];
    for (int i = 0; i < fragments.Length; i++)
    {
        fragments[i] = matches[i].Value;
    }

    return fragments;
}
/// <summary>
/// Collects the <c>src</c> value of every <c>img</c> tag in an HTML fragment.
/// </summary>
/// <param name="sHtmlText">HTML text to scan. Null or empty input yields an empty array.</param>
/// <returns>
/// One entry per matched <c>img</c> tag, in document order. An entry is an empty string when
/// the tag has a <c>src</c> attribute with no value.
/// </returns>
public static string[] GetHtmlImageUrlList(string sHtmlText)
{
    if (string.IsNullOrEmpty(sHtmlText))
    {
        return new string[0];
    }

    // The attribute name must be preceded by whitespace so that "data-src" is not
    // mistaken for "src". The value may be double-quoted, single-quoted or bare.
    Regex regImg = new Regex(
        @"<img\b[^<>]*?\ssrc\s*=\s*[""']?\s*(?<imgUrl>[^\s""'<>]*)[^<>]*?/?\s*>",
        RegexOptions.IgnoreCase);

    MatchCollection matches = regImg.Matches(sHtmlText);
    string[] sUrlList = new string[matches.Count];
    for (int i = 0; i < sUrlList.Length; i++)
    {
        sUrlList[i] = matches[i].Groups["imgUrl"].Value;
    }

    return sUrlList;
}
/// <summary>
/// Extracts every <c>div</c> element whose opening tag is exactly
/// <c>&lt;div class='className'&gt;</c>, including the tags themselves.
/// </summary>
/// <param name="as_Html">HTML text to search. Null or empty input yields an empty array.</param>
/// <param name="className">
/// Value of the <c>class</c> attribute to look for. It is inserted into the regular
/// expression verbatim, so any regex metacharacters in it are interpreted.
/// </param>
/// <returns>The matched fragments in document order; an empty array when nothing matches.</returns>
/// <remarks>
/// Only a single-quoted <c>class</c> attribute that is the sole attribute of the tag is
/// recognised. Matching is lazy, so a match ends at the first <c>&lt;/div&gt;</c> and nested
/// divs are not balanced.
/// </remarks>
public static string[] RegexHtmlDiv(string as_Html, string className)
{
    if (string.IsNullOrEmpty(as_Html))
    {
        return new string[0];
    }

    // NOTE(review): the literal "<div class='" and "</div>" parts of this pattern were lost
    // from the listing and are reconstructed from the method name and the surviving "'>"
    // fragment - confirm against the original source.
    // [\s\S]*? matches the same text as the former (.|\n)*? without per-character alternation.
    Regex regex = new Regex(
        "<div class='" + className + "'>[\\s\\S]*?</div>",
        RegexOptions.IgnoreCase | RegexOptions.Multiline);

    MatchCollection matches = regex.Matches(as_Html);
    string[] fragments = new string[matches.Count];
    for (int i = 0; i < fragments.Length; i++)
    {
        fragments[i] = matches[i].Value;
    }

    return fragments;
}
57
/// <summary>
/// Sample parser: strips tabs and spaces from the HTML, then concatenates every
/// <c>&lt;li&gt;...&lt;/li&gt;</c> element found on a single line.
/// </summary>
/// <param name="as_Html">HTML text to parse. Null or empty input yields an empty string.</param>
/// <returns>All matched elements (tags included) joined without a separator; empty if none.</returns>
public static string RegexHTMLList(string as_Html)
{
    if (string.IsNullOrEmpty(as_Html))
    {
        return "";
    }

    // Whitespace is removed before matching, so text inside the matched elements
    // loses its spaces as well.
    as_Html = as_Html.Replace("\t", "");
    as_Html = as_Html.Replace(" ", "");

    // NOTE(review): the element name and the capture-group name were lost from the listing.
    // "li" is inferred from the method name and "content" is a placeholder - confirm both
    // against the original source. '.' does not cross line breaks here, so an element that
    // spans several lines is not matched.
    Regex regex = new Regex("<li>(?<content>.*?)</li>", RegexOptions.IgnoreCase | RegexOptions.Multiline);

    System.Text.StringBuilder joined = new System.Text.StringBuilder();
    foreach (Match match in regex.Matches(as_Html))
    {
        joined.Append(match.Value);
    }

    return joined.ToString();
}
78
/// <summary>
/// Returns the value of an attribute on the first matching tag in a string.
/// </summary>
/// <param name="str">Text to search. Null or empty input yields an empty string.</param>
/// <param name="title">Tag name without angle brackets, e.g. <c>a</c>.</param>
/// <param name="attrib">Attribute name, e.g. <c>href</c>.</param>
/// <returns>The attribute value without its quotes, or an empty string when nothing matches.</returns>
/// <remarks>
/// <paramref name="title"/> and <paramref name="attrib"/> are inserted into the regular
/// expression verbatim. The value may be double-quoted, single-quoted or bare, but must not
/// contain whitespace; a quoted value that contains whitespace is not matched.
/// </remarks>
public static string GetTitleContent(string str, string title, string attrib)
{
    if (string.IsNullOrEmpty(str))
    {
        return "";
    }

    // \b after the tag name and \s before the attribute name stop partial-name hits
    // (tag "a" vs "<abbr>", attribute "src" vs "data-src"). \1 requires the closing
    // quote to be the same character as the opening one.
    string pattern = string.Format(
        "<{0}\\b[^>]*?\\s{1}\\s*=\\s*(['\"]?)(?<url>[^'\"\\s>]+)\\1[^>]*>",
        title,
        attrib);

    Match titleMatch = Regex.Match(str, pattern, RegexOptions.IgnoreCase);

    // Groups["url"].Value is "" when the match failed.
    return titleMatch.Groups["url"].Value;
}
/// <summary>
/// Parses the attributes of an HTML element into name/value pairs.
/// </summary>
/// <param name="HtmlElement">
/// Markup of the element, e.g. <c>&lt;input type="text" name='q'&gt;</c>. Null or empty
/// input yields an empty table.
/// </param>
/// <returns>
/// A <see cref="System.Collections.Hashtable"/> whose keys are attribute names and whose
/// values are the attribute values, both as strings. Names are stored as written
/// (case-sensitive); when a name occurs more than once the last value wins.
/// </returns>
/// <remarks>
/// Values may be double-quoted, single-quoted or bare. Attributes without a value
/// (e.g. <c>disabled</c>) are not reported.
/// </remarks>
public static System.Collections.Hashtable getAttrs(string HtmlElement)
{
    System.Collections.Hashtable ht = new System.Collections.Hashtable();
    if (string.IsNullOrEmpty(HtmlElement))
    {
        return ht;
    }

    // The three alternatives reuse the group name "value", which .NET permits.
    // The name class excludes '=' and markup punctuation; the former [\S^=] did not
    // exclude '=' because '^' only negates when it is the first character of a class.
    MatchCollection mc = Regex.Matches(
        HtmlElement,
        @"(?<name>[^\s=<>/""']+)\s*=\s*(?:""(?<value>[^""]*)""|'(?<value>[^']*)'|(?<value>[^\s""'<>]+))");

    foreach (Match m in mc)
    {
        ht[m.Groups["name"].Value] = m.Groups["value"].Value;
    }

    return ht;
}
108 #endregion
c# 使用正则解析html
标签:val 返回键 lin gpo stat option any oar 之间
原文地址:https://www.cnblogs.com/cxyzhangjie/p/8266786.html
评论