C# 使用正则表达式解析 HTML

2021-04-21 17:27

阅读：604

标签：val 返回键 lin gpo stat option any oar 之间

  1 #region 解析HTML
  2         /// 
  3         /// 获取网页标签内容
  4         /// 
  5         public static string[] RegexHtmlToFormat(string as_Html, string tags)
  6         {
  7             Liststring> list = new Liststring>();
  8             Regex regex = new Regex("" + tags + "[^>]*?>[\\s\\S]*?" + tags + ">", RegexOptions.IgnoreCase | RegexOptions.Multiline);
  9             if (regex.IsMatch(as_Html))
 10             {
 11                 MatchCollection matchCollection = regex.Matches(as_Html);
 12                 foreach (Match match in matchCollection)
 13                 {
 14                     list.Add(match.Value);//获取到的
 15                 }
 16             }
 17             return list.ToArray();
 18         }
 19         ///    
 20         /// 取得HTML中所有图片的 URL。   
 21         ///    
 22         /// HTML代码   
 23         /// 图片的URL列表   
 24         public static string[] GetHtmlImageUrlList(string sHtmlText)
 25         {
 26             // 定义正则表达式用来匹配 img 标签   
 27             Regex regImg = new Regex(@"]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""‘]?[\s\t\r\n]*(?[^\s\t\r\n""‘]*)[^]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
 28 
 29             // 搜索匹配的字符串   
 30             MatchCollection matches = regImg.Matches(sHtmlText);
 31             int i = 0;
 32             string[] sUrlList = new string[matches.Count];
 33 
 34             // 取得匹配项列表   
 35             foreach (Match match in matches)
 36                 sUrlList[i++] = match.Groups["imgUrl"].Value;
 37             return sUrlList;
 38         }
 39         /// 
 40         /// 获取页面内所有漫画图片地址
 41         /// 
 42         public static string[] RegexHtmlDiv(string as_Html, string className)
 43         {
 44             Liststring> list = new Liststring>();
 45             Regex regex = new Regex("
" + className + "‘>(.|\n)*?
", RegexOptions.IgnoreCase | RegexOptions.Multiline);
 46             if (regex.IsMatch(as_Html))
 47             {
 48                 MatchCollection matchCollection = regex.Matches(as_Html);
 49                 foreach (Match match in matchCollection)
 50                 {
 51                     string ls_rc = match.Value;
 52                     list.Add(ls_rc);//获取到的
 53                 }
 54             }
 55             return list.ToArray();
 56         }
 57 
 58         /// 
 59         /// 解析HTML
 60         /// 示例代码
 61         /// 
 62         public static string RegexHTMLList(string as_Html)
 63         {
 64             as_Html = as_Html.Replace("\t", "");
 65             as_Html = as_Html.Replace(" ", "");
 66             string ls_rc = "";
 67             Regex regex = new Regex("(?.*?)
", RegexOptions.IgnoreCase | RegexOptions.Multiline);
 68             if (regex.IsMatch(as_Html))
 69             {
 70                 MatchCollection matchCollection = regex.Matches(as_Html);
 71                 foreach (Match match in matchCollection)
 72                 {
 73                     ls_rc += match.Value;//获取到的
 74                 }
 75             }
 76             return ls_rc;
 77         }
 78 
 79         ///   
 80         /// 获取字符中指定标签的值  
 81         ///   
 82         /// 字符串  
 83         /// 标签  
 84         /// 属性名  
 85         /// 属性  
 86         public static string GetTitleContent(string str, string title, string attrib)
 87         {
 88             string tmpStr = string.Format("]*?{1}=([‘\"\"]?)(?[^‘\"\"\\s>]+)\\1[^>]*>", title, attrib); //获取之间内容  
 89             Match TitleMatch = Regex.Match(str, tmpStr, RegexOptions.IgnoreCase);
 90             string result = TitleMatch.Groups["url"].Value;
 91             return result;
 92         }
 93         /// 
 94         /// 解析控件的属性返回键值对
 95         /// 
 96         /// 
 97         /// 
 98         public static System.Collections.Hashtable getAttrs(string HtmlElement)
 99         {
100             System.Collections.Hashtable ht = new System.Collections.Hashtable();
101             MatchCollection mc = Regex.Matches(HtmlElement, "(?[\\S^=]+)\\s*=\\s*\"(?[^\"\"]+)\"|(?[\\S^=]+)\\s*=\\s*‘(?[^‘‘]+)‘|(?\\w+)=(?[^\"])(?=[\\s])");
102             foreach (Match m in mc)
103             {
104                 ht[m.Groups[1].Value] = m.Groups[2].Value;
105             }
106             return ht;
107         }
108         #endregion