close
網路爬蟲 (Crawler) 的設計
簡介
原始程式
using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
/// <summary>
/// Minimal single-threaded web crawler. Starting from the URLs already in
/// <c>urlList</c>, it downloads each page into the <c>data/</c> directory,
/// extracts every double-quoted <c>href</c> attribute value, and appends the
/// newly discovered http/https links to the end of the list so that they are
/// visited in discovery order.
/// </summary>
class WebCrawler
{
    // WebProxy proxy = new WebProxy("http://proxy.internal:3128/", true);

    /// <summary>
    /// URLs to visit, in order. <see cref="craw"/> walks this list by index and
    /// appends to it while walking, so it serves as both queue and history.
    /// </summary>
    List<String> urlList = new List<String>();

    // Characters that are replaced by '_' when a URL is turned into a file name.
    // The first four are the ones URLs routinely contain; the rest are rejected
    // by Windows file systems.
    static readonly char[] unsafeFileNameChars = { '?', '/', '&', ':', '\\', '*', '"', '<', '>', '|' };

    /// <summary>Program entry point: seeds the crawler with one URL and runs it.</summary>
    /// <param name="args">Command-line arguments (unused).</param>
    public static void Main(String[] args)
    {
        WebCrawler crawler = new WebCrawler();
        crawler.urlList.Add("http://tw.msn.com/");
        crawler.craw();
    }

    /// <summary>
    /// Visits every URL in <c>urlList</c>, including the ones appended during the
    /// crawl. Each page is saved to disk, read back, and scanned for links.
    /// A failure on one page is reported on the console and does not stop the crawl.
    /// </summary>
    public void craw()
    {
        // Every URL that has ever been queued. Without this, pages that link to
        // each other would be downloaded again and again and the list would
        // grow without bound.
        HashSet<String> seen = new HashSet<String>(urlList);

        int urlIdx = 0;
        while (urlIdx < urlList.Count)
        {
            String url = urlList[urlIdx];
            try
            {
                String fileName = "data/" + toFileName(url);
                Console.WriteLine(urlIdx + ":url=" + url + " file=" + fileName);
                urlToFile(url, fileName);
                String html = fileToText(fileName);
                foreach (String href in matches("\\shref\\s*=\\s*\"(.*?)\"", html, 1))
                {
                    String childUrl = resolveLink(url, href);
                    // Skip links that cannot be crawled and links already queued.
                    if (childUrl == null || !seen.Add(childUrl))
                    {
                        continue;
                    }
                    Console.WriteLine(childUrl);
                    urlList.Add(childUrl);
                }
            }
            catch (Exception ex)
            {
                // Best effort: report why this page failed and move on to the next one.
                Console.WriteLine("Error:" + url + " fail! (" + ex.Message + ")");
            }
            urlIdx++;
        }
    }

    /// <summary>
    /// Turns an <c>href</c> value found on a page into an absolute URL that can be downloaded.
    /// </summary>
    /// <param name="baseUrl">Absolute URL of the page the link was found on.</param>
    /// <param name="href">Raw attribute value as it appears in the HTML source.</param>
    /// <returns>
    /// The absolute http/https URL without its fragment, or <c>null</c> when the
    /// link is empty, malformed, or uses another scheme (javascript:, mailto:, ...).
    /// </returns>
    private static String resolveLink(String baseUrl, String href)
    {
        if (String.IsNullOrEmpty(href))
        {
            return null;
        }

        Uri baseUri;
        if (!Uri.TryCreate(baseUrl, UriKind.Absolute, out baseUri))
        {
            return null;
        }

        // Attribute values are HTML-escaped in the page source ("&amp;" for "&").
        String decoded = WebUtility.HtmlDecode(href).Trim();

        Uri resolved;
        if (!Uri.TryCreate(baseUri, decoded, out resolved))
        {
            return null;
        }

        if (resolved.Scheme != Uri.UriSchemeHttp && resolved.Scheme != Uri.UriSchemeHttps)
        {
            return null;
        }

        // Drop "#fragment" so the same document is not queued once per anchor.
        return resolved.GetLeftPart(UriPartial.Query);
    }

    /// <summary>
    /// Lazily enumerates one capture group of every match of a case-insensitive regular expression.
    /// </summary>
    /// <param name="pPattern">Regular expression pattern.</param>
    /// <param name="pText">Text to search.</param>
    /// <param name="pGroupId">Index of the capture group whose value is yielded for each match.</param>
    /// <returns>A sequence of <see cref="String"/> values, one per match, in match order.</returns>
    public static IEnumerable matches(String pPattern, String pText, int pGroupId)
    {
        Regex r = new Regex(pPattern, RegexOptions.IgnoreCase | RegexOptions.Compiled);
        for (Match m = r.Match(pText); m.Success; m = m.NextMatch())
        {
            yield return m.Groups[pGroupId].Value;
        }
    }

    /// <summary>Reads a whole text file into a string.</summary>
    /// <param name="filePath">Path of the file to read.</param>
    /// <returns>The complete content of the file.</returns>
    public static String fileToText(String filePath)
    {
        // "using" closes the reader even when ReadToEnd throws.
        using (StreamReader file = new StreamReader(filePath))
        {
            return file.ReadToEnd();
        }
    }

    /// <summary>Downloads a URL and stores the response body in a file.</summary>
    /// <param name="url">Address to download.</param>
    /// <param name="file">Destination path; its directory is created when missing.</param>
    public void urlToFile(String url, String file)
    {
        // DownloadFile does not create missing directories, so make sure the
        // target directory exists before the first download.
        String directory = Path.GetDirectoryName(file);
        if (!String.IsNullOrEmpty(directory))
        {
            Directory.CreateDirectory(directory);
        }

        using (WebClient webclient = new WebClient())
        {
            // webclient.Proxy = proxy;
            webclient.DownloadFile(url, file);
        }
    }

    /// <summary>
    /// Derives a flat, lower-case file name from a URL by replacing characters
    /// that are unsafe in file names with '_' and making sure the name ends
    /// with <c>.htm</c> or <c>.html</c>.
    /// </summary>
    /// <param name="url">The URL to convert.</param>
    /// <returns>The file name (without any directory part).</returns>
    public static String toFileName(String url)
    {
        String fileName = url;
        foreach (char unsafeChar in unsafeFileNameChars)
        {
            fileName = fileName.Replace(unsafeChar, '_');
        }

        // File names are identifiers, not linguistic text: use culture-independent casing and comparison.
        fileName = fileName.ToLowerInvariant();
        if (!fileName.EndsWith(".htm", StringComparison.Ordinal) && !fileName.EndsWith(".html", StringComparison.Ordinal))
        {
            fileName = fileName + ".htm";
        }
        return fileName;
    }
}
陳鍾誠 (2010年06月15日),(網頁標題) C# : 網路爬蟲 (Crawler) 的設計,(網站標題) 免費電子書:C# 程式設計,2010年06月15日,取自 http://cs0.wikidot.com/crawler ,網頁修改第 1 版。
文章標籤
全站熱搜