声明:此正则表达式只适用于 .NET。使用流程为:发送 HTTP 请求获取整个 HTML 网页,然后从该 HTML 页面中抓取想要的数据。
第一部分:发送httpWebRequest 请求
C#代码
// Create the request for the target page ("URL" is a placeholder for the real address).
HttpWebRequest request = (HttpWebRequest)WebRequest.Create("URL");

// Browser type. Headers have to be configured before GetResponse(), because
// that call is what sends the request.
request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; .NET CLR 3.0.04506; .NET CLR 3.5.21022; .NET CLR 1.0.3705; .NET CLR 1.1.4322)";

// HTML of the returned page.
String htmlStr;

// Dispose the response and the reader so the underlying connection is released.
using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("UTF-8")))
{
    htmlStr = reader.ReadToEnd();
}
登录后复制
第二部分:根据返回的html获取有用数据。此方法适用于所有想通过ID或Class等属性定位标签、并提取其html内容的需求,下面以一个方法为例。
C#代码
/// <summary>
/// Gets the color names from a page by reading the link texts inside the
/// second element whose class attribute is DetailsC_Sku.
/// </summary>
/// <param name="htmlStr">HTML of the whole page.</param>
/// <returns>
/// The text of every a element found in that block, each followed by a
/// comma (for example "Red,Blue,"); an empty string when the input is null
/// or empty or when fewer than two DetailsC_Sku blocks are found.
/// </returns>
public String getColor(String htmlStr)
{
    if (String.IsNullOrEmpty(htmlStr))
    {
        return "";
    }

    // Matches one whole element whose class attribute is DetailsC_Sku, from
    // its opening tag to a closing tag of the same name. The Nested group
    // pushes on every inner opening tag of that name and pops on every inner
    // closing tag (.NET balancing groups). To select by id instead, replace
    // [cC][lL][aA][sS][sS] with [iI][dD].
    const string skuBlockPattern =
        @"<(?<HtmlTag>[\w]+)[^>]*\s[cC][lL][aA][sS][sS]=(?<Quote>[""']?)DetailsC_Sku(?(Quote)\k<Quote>)[""']?[^>]*>"
        + @"((?<Nested><\k<HtmlTag>[^>]*>)|</\k<HtmlTag>>(?<-Nested>)|.*?)*</\k<HtmlTag>>";

    // Singleline lets "." cross line breaks, since the block spans many lines.
    Regex skuBlockRegex = new Regex(skuBlockPattern, RegexOptions.Singleline);

    // The first block on the page is skipped; the original code names it
    // sizeHtml, so it is presumably the size selector.
    String sizeHtml = skuBlockRegex.Match(htmlStr).Value;
    if (String.IsNullOrEmpty(sizeHtml))
    {
        return "";
    }

    // Remove the first block so the same pattern now finds the next one.
    String remainingHtml = htmlStr.Replace(sizeHtml, "");
    String colorHtml = skuBlockRegex.Match(remainingHtml).Value;
    if (String.IsNullOrEmpty(colorHtml))
    {
        return "";
    }

    // Find every a element in the color block and keep only its text.
    StringBuilder colors = new StringBuilder();
    foreach (Match anchor in Regex.Matches(colorHtml, @"<a\b[^>]*>[\s\S]*?</a>"))
    {
        colors.Append(RemoveHtml(anchor.Value)).Append(",");
    }

    return colors.ToString();
}
登录后复制
C#代码
// The patterns are created once and reused by every RemoveHtml call.

// Any single HTML tag.
private static readonly Regex HtmlTagRegex =
    new Regex(@"<[^>]+>", RegexOptions.Compiled | RegexOptions.IgnoreCase);

// The non-breaking-space entity.
private static readonly Regex HtmlSpaceRegex =
    new Regex(@"&nbsp;", RegexOptions.Compiled | RegexOptions.IgnoreCase);

// Runs of two or more whitespace characters.
private static readonly Regex WhitespaceRunRegex =
    new Regex(@"\s{2,}", RegexOptions.Compiled);

// Whole style/script elements including their content. Singleline lets "."
// cross line breaks, because these blocks usually span several lines.
private static readonly Regex StyleBlockRegex =
    new Regex(@"<style\b[^>]*>.*?</style\s*>", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);

private static readonly Regex ScriptBlockRegex =
    new Regex(@"<script\b[^>]*>.*?</script\s*>", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);

/// <summary>
/// Removes the HTML markup from a string and returns the text inside the tags.
/// </summary>
/// <param name="src">HTML fragment to clean; may be null or empty.</param>
/// <returns>
/// The text with style and script blocks and all tags removed, the
/// non-breaking-space entity replaced by a space, whitespace runs collapsed
/// to one space, and leading and trailing whitespace trimmed. An empty
/// string is returned for null or empty input.
/// </returns>
public string RemoveHtml(string src)
{
    if (string.IsNullOrEmpty(src))
    {
        return string.Empty;
    }

    // Style and script blocks go first so their content is not left behind
    // as text once the surrounding tags are stripped.
    src = StyleBlockRegex.Replace(src, string.Empty);
    src = ScriptBlockRegex.Replace(src, string.Empty);
    src = HtmlTagRegex.Replace(src, string.Empty);
    src = HtmlSpaceRegex.Replace(src, " ");
    src = WhitespaceRunRegex.Replace(src, " ");
    return src.Trim();
}
登录后复制
版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容, 请发送邮件至253000106@qq.com举报,一经查实,本站将立刻删除。
发布者:PHP中文网,转载请注明出处:https://www.chuangxiangniao.com/p/2554266.html