今天用到了 DOM 来分析 HTML,在此记录一下:
namespace mshtml
{
    /// <summary>
    /// Managed declaration of the COM <c>IPersistStreamInit</c> interface
    /// (IID 7FD52380-4E07-101B-AE2D-08002B2EC713), exposed as an
    /// IUnknown-based import. <c>DomParser</c> queries HTML documents for it
    /// and calls <see cref="InitNew"/> before using them.
    /// </summary>
    /// <remarks>
    /// The member order presumably mirrors the native vtable layout, so the
    /// methods must not be reordered.
    /// </remarks>
    [ComImport]
    [ComVisible(true)]
    [Guid("7FD52380-4E07-101B-AE2D-08002B2EC713")]
    [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
    public interface IPersistStreamInit
    {
        /// <summary>Fills <paramref name="pClassID"/> with the object's class identifier.</summary>
        void GetClassID([In, Out] ref Guid pClassID);

        /// <summary>
        /// Returns the raw integer result of the native call; because of
        /// <c>PreserveSig</c> it is not translated into an exception.
        /// </summary>
        [PreserveSig]
        [return: MarshalAs(UnmanagedType.I4)]
        int IsDirty();

        /// <summary>Loads the object's state from the stream <paramref name="pstm"/>.</summary>
        void Load([In, MarshalAs(UnmanagedType.Interface)] UCOMIStream pstm);

        /// <summary>
        /// Saves the object's state to <paramref name="pstm"/>;
        /// <paramref name="fClearDirty"/> is passed as a 32-bit integer flag.
        /// </summary>
        void Save(
            [In, MarshalAs(UnmanagedType.Interface)] UCOMIStream pstm,
            [In, MarshalAs(UnmanagedType.I4)] int fClearDirty);

        // NOTE(review): a scalar long marshalled as LPArray looks suspicious;
        // the native method writes through a ULARGE_INTEGER pointer, so
        // "out long" is probably what is wanted. Left unchanged here because
        // fixing it alters the managed signature - confirm before calling.
        void GetSizeMax([Out, MarshalAs(UnmanagedType.LPArray)] long pcbSize);

        /// <summary>Initializes the object to a default, freshly created state.</summary>
        void InitNew();
    }
}
/// <summary>
/// Helpers that build MSHTML documents either from an in-memory HTML string
/// or by loading a URL.
/// </summary>
class DomParser
{
    /// <summary>
    /// Parses <paramref name="html"/> with the markup services of a newly
    /// created HTML document.
    /// </summary>
    /// <param name="html">The markup to parse; must not be null.</param>
    /// <returns>
    /// The markup container produced by the parser viewed as
    /// <c>IHTMLDocument2</c>, or null when no container was produced or the
    /// container does not expose that interface.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
    /// <exception cref="InvalidOperationException">
    /// The document does not expose <c>IPersistStreamInit</c>.
    /// </exception>
    public static unsafe IHTMLDocument2 Parse(string html)
    {
        if (html == null)
        {
            throw new ArgumentNullException("html");
        }

        IHTMLDocument2 pDocument = new HTMLDocumentClass();
        InitializeDocument(pDocument);

        // A direct cast either succeeds or throws, so no null check is needed.
        IMarkupServices ms = (IMarkupServices)pDocument;

        IMarkupPointer pStart, pEnd;
        ms.CreateMarkupPointer(out pStart);
        ms.CreateMarkupPointer(out pEnd);

        // ParseString takes a reference to the first UTF-16 code unit, so the
        // string is copied into unmanaged memory for the duration of the call.
        IntPtr pSource = Marshal.StringToHGlobalUni(html);
        try
        {
            IMarkupContainer pMC = null;
            ms.ParseString(ref *(ushort*)pSource.ToPointer(), 0, out pMC, pStart, pEnd);
            return pMC as IHTMLDocument2;
        }
        finally
        {
            // The buffer came from StringToHGlobalUni, so it must be freed
            // with FreeHGlobal. Marshal.Release is for COM interface pointers
            // and would treat the string data as an object.
            Marshal.FreeHGlobal(pSource);
        }
    }

    /// <summary>
    /// Creates a document from <paramref name="url"/> and pumps the message
    /// loop until the document reports the "complete" ready state.
    /// </summary>
    /// <param name="url">The address of the document to load.</param>
    /// <returns>The loaded document.</returns>
    /// <exception cref="InvalidOperationException">
    /// The document does not expose <c>IPersistStreamInit</c>.
    /// </exception>
    public static IHTMLDocument2 ParseFromUrl(string url)
    {
        HTMLDocument doc = new HTMLDocument();
        InitializeDocument(doc);

        IHTMLDocument2 doc2 = doc.createDocumentFromUrl(url, null);

        // NOTE(review): there is no timeout; this loops for as long as the
        // document has not reached the "complete" state.
        while (doc2.readyState != "complete")
        {
            Application.DoEvents();
        }

        return doc2;
    }

    /// <summary>
    /// Calls <c>IPersistStreamInit.InitNew</c> on <paramref name="document"/>,
    /// failing with a descriptive error when the interface is unavailable.
    /// </summary>
    private static void InitializeDocument(object document)
    {
        IPersistStreamInit persist = document as IPersistStreamInit;
        if (persist == null)
        {
            throw new InvalidOperationException(
                "The HTML document does not expose IPersistStreamInit; " +
                "make sure the calling thread is marked with [STAThread].");
        }

        persist.InitNew();
    }
}
/// <summary>
/// Abstraction over a source that supplies the raw bytes behind a location.
/// </summary>
interface IContentProvider
{
    /// <summary>
    /// Retrieves the content identified by <paramref name="url"/>.
    /// </summary>
    /// <param name="url">
    /// The location of the content; how it is interpreted is up to the
    /// implementation.
    /// </param>
    /// <returns>The content as a byte array.</returns>
    byte[] GetContent(string url);
}
/// <summary>
/// Content provider that downloads the bytes behind a URL using
/// <see cref="WebClient"/>.
/// </summary>
public class HttpContentProvider : IContentProvider
{
    #region IContentProvider Members

    /// <summary>
    /// Downloads the resource at <paramref name="url"/>.
    /// </summary>
    /// <param name="url">The address to download from.</param>
    /// <returns>The downloaded bytes.</returns>
    public byte[] GetContent(string url)
    {
        // A client is created per call and disposed afterwards, so the
        // provider holds no undisposed resource and calls share no state.
        using (WebClient client = new WebClient())
        {
            return client.DownloadData(url);
        }
    }

    #endregion
}
/// <summary>
/// Content provider that reads the bytes of a local file.
/// </summary>
public class FileContentProvider : IContentProvider
{
    #region IContentProvider Members

    /// <summary>
    /// Reads the whole file at <paramref name="url"/>.
    /// </summary>
    /// <param name="url">The path of the file to read.</param>
    /// <returns>The complete contents of the file.</returns>
    public byte[] GetContent(string url)
    {
        // A single Stream.Read call may return fewer bytes than requested,
        // which would leave the tail of the buffer zero-filled.
        // File.ReadAllBytes keeps reading until the whole file is in memory.
        return File.ReadAllBytes(url);
    }

    #endregion
}
/// <summary>
/// Fetches the start page and parses it into an MSHTML document.
/// </summary>
class TypeSpider
{
    // The start location and the provider are chosen together: the debug
    // build reads a local file, every other build downloads over HTTP.
    // Pairing the HTTP URL with the file provider would fail when opening it.
#if DEBUG
    string startUrl = "site_map.html";
    IContentProvider provider = new FileContentProvider();
#else
    string startUrl = "http://www.kelkoo.nl/sm_site-map.html";
    IContentProvider provider = new HttpContentProvider();
#endif

    /// <summary>
    /// Loads the start page, decodes it as ISO-8859-1 and hands the text to
    /// <c>DomParser.Parse</c>.
    /// </summary>
    public void Run()
    {
        byte[] content = provider.GetContent(startUrl);
        string html = Encoding.GetEncoding("iso-8859-1").GetString(content);

        // The parsed document is not used any further here.
        IHTMLDocument2 document = DomParser.Parse(html);
    }
}
/// <summary>
/// Application entry point: creates a <c>TypeSpider</c> and runs it.
/// </summary>
class Program
{
    // The entry point must run on a single-threaded apartment thread;
    // without [STAThread] the interface conversions on the HTML document
    // yield null.
    [STAThread]
    static void Main(string[] args)
    {
        TypeSpider spider = new TypeSpider();
        spider.Run();
    }
}
记得给 Main 方法加上 [STAThread] 标记,否则接口转换的结果会是 null。