using
System;
using
System.Xml;
using
System.Text;
using
System.Net;
using
System.IO;
using
System.Collections;
using
System.Text.RegularExpressions;
public
class
App
{
/// <summary>
/// Console entry point: asks for a page address, downloads that page,
/// extracts its hyperlinks and hands them to <c>WriteToXml</c>.
/// </summary>
public static void Main()
{
    Console.Write("请输入一个网页地址:");
    string pageAddress = Console.ReadLine();

    Console.WriteLine("正在获取页面代码,请稍侯...");
    string pageHtml = GetPageSource(pageAddress);

    Console.WriteLine("正在提取超链接,请稍侯...");
    ArrayList links = GetHyperLinks(pageHtml);

    Console.WriteLine("正在写入文件,请稍侯...");
    WriteToXml(pageAddress, links);
}
/// <summary>
/// Downloads the HTML source of the given web page.
/// </summary>
/// <param name="URL">Address of the page to fetch; it is parsed with <c>new Uri(URL)</c>.</param>
/// <returns>The whole response body, decoded as GB2312 text.</returns>
static string GetPageSource(string URL)
{
    Uri uri = new Uri(URL);
    HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);

    // Request options have to be assigned before GetResponse() submits the
    // request; setting them afterwards is too late to influence it.
    hwReq.Method = "GET";
    hwReq.KeepAlive = false;

    // Dispose the response, its stream and the reader even when reading fails,
    // so the underlying connection is always released.
    using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
    using (Stream body = hwRes.GetResponseStream())
    using (StreamReader reader = new StreamReader(body, System.Text.Encoding.GetEncoding("GB2312")))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Collects every distinct match of the <c>strRegex</c> pattern
/// (case-insensitive) found in the given HTML.
/// </summary>
/// <param name="htmlCode">HTML text to scan.</param>
/// <returns>The distinct matched strings, ordered by <c>ArrayList.Sort()</c>.</returns>
static ArrayList GetHyperLinks(string htmlCode)
{
    ArrayList uniqueLinks = new ArrayList();
    Regex linkPattern = new Regex(strRegex, RegexOptions.IgnoreCase);

    foreach (Match hit in linkPattern.Matches(htmlCode))
    {
        string candidate = hit.ToString();

        // Keep only the first occurrence of each URL.
        if (!uniqueLinks.Contains(candidate))
        {
            uniqueLinks.Add(candidate);
        }
    }

    uniqueLinks.Sort();
    return uniqueLinks;
}
/// <summary>
/// Writes the extracted links to the file "HyperLinks.xml" as indented UTF-8 XML.
/// </summary>
/// <param name="strURL">Address of the page the links were taken from; it is quoted in an XML comment.</param>
/// <param name="alHyperLinks">Links to write; every item is enumerated as a string.</param>
static void WriteToXml(string strURL, ArrayList alHyperLinks)
{
    // XmlTextWriter.WriteComment throws ArgumentException for text containing
    // "--", which a URL may legitimately contain. Break such runs up first;
    // the loop handles runs of three or more hyphens.
    string commentText = "提取自" + strURL + "的超链接";
    while (commentText.Contains("--"))
    {
        commentText = commentText.Replace("--", "- -");
    }

    // The using block closes the writer (and the file) even if a write throws.
    using (XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8))
    {
        writer.Formatting = Formatting.Indented;
        writer.WriteStartDocument(false);
        writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
        writer.WriteComment(commentText);

        // Two nested "HyperLinks" elements are opened; the inner one carries
        // the DateTime attribute and the link elements.
        writer.WriteStartElement("HyperLinks");
        writer.WriteStartElement("HyperLinks", null);
        writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

        foreach (string str in alHyperLinks)
        {
            // The element name comes from GetDomain(str); the element text is the link itself.
            string title = GetDomain(str);
            writer.WriteElementString(title, null, str);
        }

        writer.WriteEndElement();
        writer.WriteEndElement();
        writer.Flush();
    }
}
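// 联系客服 ("contact customer service") - stray text that appears to be web-page residue; not part of the program.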