C# 使用正则表达式解析 HTML

2021-04-21 17:27

阅读:503

标签:val   返回键   lin   gpo   stat   option   any   oar   之间   

  1 #region 解析HTML
  2         /// 
  3         /// 获取网页标签内容
  4         /// 
  5         public static string[] RegexHtmlToFormat(string as_Html, string tags)
  6         {
  7             Liststring> list = new Liststring>();
  8             Regex regex = new Regex("" + tags + "[^>]*?>[\\s\\S]*?" + tags + ">", RegexOptions.IgnoreCase | RegexOptions.Multiline);
  9             if (regex.IsMatch(as_Html))
 10             {
 11                 MatchCollection matchCollection = regex.Matches(as_Html);
 12                 foreach (Match match in matchCollection)
 13                 {
 14                     list.Add(match.Value);//获取到的
 15                 }
 16             }
 17             return list.ToArray();
 18         }
 19         ///    
 20         /// 取得HTML中所有图片的 URL。   
 21         ///    
 22         /// HTML代码   
 23         /// 图片的URL列表   
 24         public static string[] GetHtmlImageUrlList(string sHtmlText)
 25         {
 26             // 定义正则表达式用来匹配 img 标签   
 27             Regex regImg = new Regex(@"]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""‘]?[\s\t\r\n]*(?[^\s\t\r\n""‘]*)[^]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
 28 
 29             // 搜索匹配的字符串   
 30             MatchCollection matches = regImg.Matches(sHtmlText);
 31             int i = 0;
 32             string[] sUrlList = new string[matches.Count];
 33 
 34             // 取得匹配项列表   
 35             foreach (Match match in matches)
 36                 sUrlList[i++] = match.Groups["imgUrl"].Value;
 37             return sUrlList;
 38         }
 39         /// 
 40         /// 获取页面内所有漫画图片地址
 41         /// 
 42         public static string[] RegexHtmlDiv(string as_Html, string className)
 43         {
 44             Liststring> list = new Liststring>();
 45             Regex regex = new Regex("
" + className + "‘>(.|\n)*?
", RegexOptions.IgnoreCase | RegexOptions.Multiline); 46 if (regex.IsMatch(as_Html)) 47 { 48 MatchCollection matchCollection = regex.Matches(as_Html); 49 foreach (Match match in matchCollection) 50 { 51 string ls_rc = match.Value; 52 list.Add(ls_rc);//获取到的 53 } 54 } 55 return list.ToArray(); 56 } 57 58 /// 59 /// 解析HTML 60 /// 示例代码 61 /// 62 public static string RegexHTMLList(string as_Html) 63 { 64 as_Html = as_Html.Replace("\t", ""); 65 as_Html = as_Html.Replace(" ", ""); 66 string ls_rc = ""; 67 Regex regex = new Regex("(?.*?) ", RegexOptions.IgnoreCase | RegexOptions.Multiline); 68 if (regex.IsMatch(as_Html)) 69 { 70 MatchCollection matchCollection = regex.Matches(as_Html); 71 foreach (Match match in matchCollection) 72 { 73 ls_rc += match.Value;//获取到的 74 } 75 } 76 return ls_rc; 77 } 78 79 /// 80 /// 获取字符中指定标签的值 81 /// 82 /// 字符串 83 /// 标签 84 /// 属性名 85 /// 属性 86 public static string GetTitleContent(string str, string title, string attrib) 87 { 88 string tmpStr = string.Format("]*?{1}=([‘\"\"]?)(?[^‘\"\"\\s>]+)\\1[^>]*>", title, attrib); //获取之间内容 89 Match TitleMatch = Regex.Match(str, tmpStr, RegexOptions.IgnoreCase); 90 string result = TitleMatch.Groups["url"].Value; 91 return result; 92 } 93 /// 94 /// 解析控件的属性返回键值对 95 /// 96 /// 97 /// 98 public static System.Collections.Hashtable getAttrs(string HtmlElement) 99 { 100 System.Collections.Hashtable ht = new System.Collections.Hashtable(); 101 MatchCollection mc = Regex.Matches(HtmlElement, "(?[\\S^=]+)\\s*=\\s*\"(?[^\"\"]+)\"|(?[\\S^=]+)\\s*=\\s*‘(?[^‘‘]+)‘|(?\\w+)=(?[^\"])(?=[\\s])"); 102 foreach (Match m in mc) 103 { 104 ht[m.Groups[1].Value] = m.Groups[2].Value; 105 } 106 return ht; 107 } 108 #endregion

 

C# 使用正则表达式解析 HTML

标签:val   返回键   lin   gpo   stat   option   any   oar   之间   

原文地址:https://www.cnblogs.com/cxyzhangjie/p/8266786.html


评论


亲,登录后才可以留言!