打开APP
userphoto
未登录

开通VIP,畅享免费电子书等14项超值服务

开通VIP
发一个解析HTML的代码.目前只能解析table与div....

代码挺简单的，但是被解析的标签一定要配对出现，否则出现错误我不管。至少用来解析baidu搜索结果是没问题的。
有志于写解析器的同学可以拿去玩玩

C# code
/// <summary>
/// Minimal HTML scanner that builds a tree from <c>table</c> and <c>div</c> tags only.
/// All other markup is left untouched inside the inner/outer HTML strings.
/// </summary>
public class SimpleHtmlParser
{
    // Group 1 is "/" for a closing tag, group 2 is the tag name.
    // The lookahead after the name keeps look-alike tags such as <tablet> or <divider> from matching.
    private static readonly Regex TagRegex = new Regex(
        @"<(/?)(table|div)(?=[\s/>])[^>]*>",
        RegexOptions.IgnoreCase);

    // Singleline is required so that ".*?" can cross the line breaks inside a multi-line head section.
    private static readonly Regex HeadRegex = new Regex(
        @"<head(?=[\s>])[^>]*>.*?</head\s*>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Opening and closing font/span/a tags; the lookahead stops "<a" from also matching <abbr>, <article>, ...
    private static readonly Regex InlineTagRegex = new Regex(
        @"</?(?:font|span|a)(?=[\s/>])[^>]*>",
        RegexOptions.IgnoreCase);

    /// <summary>
    /// Scans <paramref name="s"/> for table and div tags and builds the element tree.
    /// </summary>
    /// <param name="s">The HTML text to scan. Null or empty input yields an empty tree.</param>
    /// <param name="elements">
    /// Receives every element whose closing tag was found, in the order the closing tags appear.
    /// </param>
    /// <returns>
    /// A synthetic root element whose <see cref="Element.Children"/> are the top-level tags.
    /// The root itself has no tag, so its OuterHtml and InnerHtml are never set.
    /// </returns>
    public static Element ParseHtml(string s, out List<Element> elements)
    {
        elements = new List<Element>();
        Element root = new Element();
        if (string.IsNullOrEmpty(s))
        {
            return root;
        }

        Stack<Element> openElements = new Stack<Element>();
        Element current = root;

        foreach (Match match in TagRegex.Matches(s))
        {
            string tag = match.Value;
            bool isClosing = match.Groups[1].Length > 0;

            if (!isClosing)
            {
                // A self-closed tag never receives a closing tag, so pushing it would unbalance the stack.
                if (tag.EndsWith("/>", StringComparison.Ordinal))
                {
                    continue;
                }

                // Decide the type from the captured tag name, not from the whole tag text,
                // so attribute values such as class="div" cannot influence it.
                bool isDiv = string.Equals(match.Groups[2].Value, "div", StringComparison.OrdinalIgnoreCase);
                Element opened = isDiv ? (Element)new DivElement() : new TableElement();

                opened.StartTagIndex = match.Index;
                opened.StartTagLength = match.Length;
                opened.BegTag = tag;

                opened.Parent = current;
                current.Children.Add(opened);

                openElements.Push(opened);
                current = opened;
            }
            else
            {
                // An unmatched closing tag has nothing to close; ignore it instead of popping an empty stack.
                if (openElements.Count == 0)
                {
                    continue;
                }

                Element closed = openElements.Pop();
                closed.EndTag = tag;
                closed.EndIndex = match.Index;
                closed.EndTagLength = match.Length;

                int innerStart = closed.StartTagIndex + closed.StartTagLength;
                closed.OuterHtml = s.Substring(
                    closed.StartTagIndex,
                    closed.EndIndex + closed.EndTagLength - closed.StartTagIndex);
                closed.InnerHtml = s.Substring(innerStart, closed.EndIndex - innerStart);

                current = closed.Parent;
                elements.Add(closed);
            }
        }

        return root;
    }

    /// <summary>
    /// Removes markup that is irrelevant to the table/div structure: the whole head section
    /// and all font, span and a tags. Only the tags are removed; the text between them is kept.
    /// </summary>
    /// <param name="s">The HTML text to clean. Null or empty input is returned unchanged.</param>
    /// <returns>The cleaned HTML text.</returns>
    public static string ReplaceFontSpan(string s)
    {
        if (string.IsNullOrEmpty(s))
        {
            return s;
        }

        // Drop the head first so tags nested inside it disappear together with it.
        s = HeadRegex.Replace(s, "");
        return InlineTagRegex.Replace(s, "");
    }

    /// <summary>
    /// Downloads the body of <paramref name="url"/> with an HTTP GET request.
    /// </summary>
    /// <param name="url">The address to fetch.</param>
    /// <returns>The response body, or an empty string when the request fails for any reason.</returns>
    public static string DownLoadHtml(string url)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            // HTTP method names are case-sensitive on the wire, so use the canonical spelling.
            request.Method = "GET";

            // Dispose the response, stream and reader so the connection is released promptly.
            // NOTE(review): Encoding.Default is platform-dependent; confirm it matches the target pages.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, System.Text.Encoding.Default))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception)
        {
            // Deliberate best effort: callers rely on an empty string to signal failure.
            return "";
        }
    }
}

/// <summary>
/// A node of the tree built by <see cref="SimpleHtmlParser.ParseHtml"/>.
/// </summary>
public class Element : StringElement
{
    /// <summary>Offset of the opening tag in the parsed text.</summary>
    public int StartTagIndex { get; set; }

    /// <summary>Length of the opening tag.</summary>
    public int StartTagLength { get; set; }

    /// <summary>Offset of the closing tag in the parsed text.</summary>
    public int EndIndex { get; set; }

    /// <summary>Length of the closing tag.</summary>
    public int EndTagLength { get; set; }

    /// <summary>Text of the opening tag, attributes included.</summary>
    public string BegTag { get; set; }

    /// <summary>Text of the closing tag.</summary>
    public string EndTag { get; set; }

    /// <summary>Elements opened while this element was the innermost open one.</summary>
    public List<Element> Children = new List<Element>();

    /// <summary>The enclosing element; for top-level tags this is the synthetic root.</summary>
    public Element Parent { get; set; }
}

/// <summary>Element created for a div tag.</summary>
public class DivElement : Element
{
}

/// <summary>Element created for a table tag.</summary>
public class TableElement : Element
{
}

/// <summary>Element type for tr tags; the parser does not create it.</summary>
public class TrElement : Element
{
}

/// <summary>
/// Holds the HTML text of an element. Both values are assigned only once the closing tag is found.
/// </summary>
public class StringElement
{
    /// <summary>The text from the start of the opening tag to the end of the closing tag.</summary>
    public string OuterHtml { get; set; }

    /// <summary>The text between the opening tag and the closing tag.</summary>
    public string InnerHtml { get; set; }
}


调用代码
C# code
/// <summary>
/// Demo: downloads a search result page, strips the irrelevant tags, parses the table/div
/// structure and shows the results in message boxes.
/// </summary>
private void Form1_Load(object sender, EventArgs e)
{
    var url = "http://www.baidu.com/s?wd=惠阳妇科病医院&rsv_bp=0&rsv_spt=3&inputT=21000";
    var html = SimpleHtmlParser.DownLoadHtml(url);

    // Remove tags that are irrelevant to the structure before parsing.
    html = SimpleHtmlParser.ReplaceFontSpan(html);

    List<Element> closedElements;
    var root = SimpleHtmlParser.ParseHtml(html, out closedElements);

    // Show every parsed element that has no child elements.
    foreach (var item in closedElements)
    {
        if (item.Children.Count <= 0)
        {
            MessageBox.Show(item.OuterHtml);
        }
    }

    // Walk the whole element tree.
    List(root);
}

/// <summary>
/// Shows the outer HTML of every element below <paramref name="e"/>, children before their parent.
/// </summary>
/// <param name="e">The element to start from; null is ignored.</param>
public void List(Element e)
{
    if (e == null)
    {
        return;
    }

    foreach (var child in e.Children)
    {
        List(child);
    }

    // The synthetic root and elements that were never closed have no OuterHtml;
    // skip them instead of showing a blank dialog.
    if (!string.IsNullOrEmpty(e.OuterHtml))
    {
        MessageBox.Show(e.OuterHtml);
    }
}

本站仅提供存储服务,所有内容均由用户发布,如发现有害或侵权内容,请点击举报
打开APP,阅读全文并永久保存 查看更多类似文章
猜你喜欢
类似文章
【热】打开小程序,算一算2024你的财运
玩转动态编译:四、封装
.NET Core一行代码导入导出Excel生成Word
C#操作XML方法集合
WinForm 自动更新程序(一)
java
正则表达式:matches
更多类似文章 >>
生活服务
热点新闻
分享 收藏 导长图 关注 下载文章
绑定账号成功
后续可登录账号畅享VIP特权!
如果VIP功能使用有故障,
可点击这里联系客服!

联系客服