惟愿终日无丝竹之乱耳,无案牍之劳形。看那平心所至处,皆为仙境。

C#中使用Dom来分析HTML

上一篇 / 下一篇  2007-08-29 12:49:05 / 个人分类:搞搞技术

今天用到了DOM来分析HTML,在此记录一下:

namespace mshtml
{
    /// <summary>
    /// Managed declaration of the COM <c>IPersistStreamInit</c> interface
    /// (IID 7FD52380-4E07-101B-AE2D-08002B2EC713), used here to initialise
    /// an MSHTML document object before markup is loaded into it.
    /// </summary>
    /// <remarks>
    /// This is a <c>ComImport</c> interface: the declaration order of the
    /// methods defines the vtable layout, so members must not be reordered,
    /// added or removed.
    /// NOTE(review): <c>UCOMIStream</c> is marked obsolete from .NET 2.0 in
    /// favour of <c>System.Runtime.InteropServices.ComTypes.IStream</c>; it is
    /// kept here so the declaration does not need an additional namespace.
    /// </remarks>
    [ComVisible(true), ComImport(), Guid("7FD52380-4E07-101B-AE2D-08002B2EC713"), InterfaceTypeAttribute(ComInterfaceType.InterfaceIsIUnknown)]
    public interface IPersistStreamInit
    {
        /// <summary>Retrieves the class identifier (CLSID) of the object.</summary>
        /// <param name="pClassID">Receives the CLSID.</param>
        void GetClassID([In, Out] ref Guid pClassID);

        /// <summary>Reports whether the object has changed since it was last saved.</summary>
        /// <returns>
        /// The raw HRESULT (the signature is preserved, so no exception is
        /// raised for non-zero values such as S_FALSE).
        /// </returns>
        [return: MarshalAs(UnmanagedType.I4)]
        [PreserveSig]
        int IsDirty();

        /// <summary>Initialises the object from the given stream.</summary>
        /// <param name="pstm">Stream to load from.</param>
        void Load([In, MarshalAs(UnmanagedType.Interface)] UCOMIStream pstm);

        /// <summary>Saves the object to the given stream.</summary>
        /// <param name="pstm">Stream to save into.</param>
        /// <param name="fClearDirty">Non-zero to clear the dirty flag after saving.</param>
        void Save([In, MarshalAs(UnmanagedType.Interface)] UCOMIStream pstm,
            [In, MarshalAs(UnmanagedType.I4)] int fClearDirty);

        /// <summary>Retrieves the maximum size, in bytes, of the stream needed to save the object.</summary>
        /// <param name="pcbSize">
        /// Receives the size. The native parameter is a pointer to a 64-bit
        /// integer, so it is declared as <c>out long</c>; the previous by-value
        /// <c>long</c> tagged as <c>LPArray</c> could not be marshalled.
        /// </param>
        void GetSizeMax(out long pcbSize);

        /// <summary>Initialises the object to its default (empty) state.</summary>
        void InitNew();
    }
}

    class DomParser
    {
        public static unsafe IHTMLDocument2 Parse(string html)
        {
            IHTMLDocument2 pDocument = new HTMLDocumentClass();
            if (pDocument != null)
            {
                IPersistStreamInit pPersist = pDocument as IPersistStreamInit;
                pPersist.InitNew();
                //pPersist = null;
                IMarkupServices ms = (IMarkupServices)pDocument;
                if (ms != null)
                {
                    IMarkupContainer pMC = null;
                    IMarkupPointer pStart, pEnd;
                    ms.CreateMarkupPointer(out pStart);
                    ms.CreateMarkupPointer(out pEnd);
                    StringBuilder sb = new StringBuilder(html);
                    IntPtr pSource = Marshal.StringToHGlobalUni(html);
                    ms.ParseString(ref *(ushort*)pSource.ToPointer(), 0, out pMC, pStart, pEnd);
                    if (pMC != null)
                    {
                        Marshal.Release(pSource);
                        return pMC as IHTMLDocument2;
                    }
                    Marshal.Release(pSource);
                }
            }
            return null;
        }

        public static IHTMLDocument2 ParseFromUrl(string url)
        {
            HTMLDocument doc = new HTMLDocument();
            IPersistStreamInit objIPs = doc as IPersistStreamInit;
            objIPs.InitNew();

            IHTMLDocument2 doc2 = doc.createDocumentFromUrl(url, null);
            int x = 10;

            while (doc2.readyState != "complete")
            {
                x++;
                Application.DoEvents();
            }

            //IHTMLDocument3 doc3 = doc2 as IHTMLDocument3;
            return doc2;
        }
    }

    /// <summary>
    /// Abstraction over a source of raw document bytes, so that callers can
    /// fetch content without knowing where it comes from.
    /// </summary>
    interface IContentProvider
    {
        /// <summary>Returns the raw bytes of the resource identified by <paramref name="url"/>.</summary>
        /// <param name="url">
        /// Locator of the resource; its interpretation (web address, file path, ...)
        /// is up to the implementation.
        /// </param>
        /// <returns>The content as an undecoded byte array.</returns>
        byte[] GetContent(string url);
    }

    public class HttpContentProvider : IContentProvider
    {
        #region IContentProvider Members

        WebClient wc = new WebClient();
        public byte[] GetContent(string url)
        {
            return wc.DownloadData(url);
        }

        #endregion
    }

    public class FileContentProvider : IContentProvider
    {
        #region IContentProvider Members

        public byte[] GetContent(string url)
        {
            using (FileStream fs = File.OpenRead(url))
            {
                byte[] bytes = new byte[fs.Length];
                fs.Read(bytes, 0, bytes.Length);

                return bytes;
            }
        }

        #endregion
    }

/// <summary>
/// Fetches the start page and parses it into an MSHTML DOM.
/// </summary>
class TypeSpider
{
#if DEBUG
    // Debug builds read a local copy of the page.
    string startUrl = "site_map.html";
#else
    string startUrl = "http://www.kelkoo.nl/sm_site-map.html";
#endif

    /// <summary>
    /// Loads <c>startUrl</c>, decodes it as ISO-8859-1 and parses the markup.
    /// </summary>
    public void Run()
    {
        IContentProvider provider = CreateProvider(startUrl);
        string html = Encoding.GetEncoding("iso-8859-1").GetString(provider.GetContent(startUrl));

        // The parsed document is not used further here; the call demonstrates parsing.
        DomParser.Parse(html);
    }

    /// <summary>
    /// Picks the provider that matches the location: http/https addresses are
    /// downloaded, anything else is treated as a file path. Previously the file
    /// provider was used unconditionally, which cannot open the http address
    /// used by non-DEBUG builds.
    /// </summary>
    private static IContentProvider CreateProvider(string location)
    {
        Uri uri;
        if (Uri.TryCreate(location, UriKind.Absolute, out uri)
            && (uri.Scheme == Uri.UriSchemeHttp || uri.Scheme == Uri.UriSchemeHttps))
        {
            return new HttpContentProvider();
        }

        return new FileContentProvider();
    }
}

    class Program
    {
        [STAThread]
        static void Main(string[] args)
        {
            new TypeSpider().Run();
        }
    }

记得要给 Main 方法加上 [STAThread] 标记,否则接口转换(as)的结果会是 null。

相关阅读:

TAG: domparser html imarkupservices

引用 删除 Guest   /   2008-06-16 17:32:13
我觉得这里的这种思路不错:
http://blog.csdn.net/RonoTian/archive/2008/06/06/2517568.aspx

感觉新颖。
引用 删除 Guest   /   2008-06-16 17:31:30
-3
引用 删除 Guest   /   2008-06-13 16:28:47
-3
引用 删除 Guest   /   2008-05-23 22:26:23
-1
引用 删除 Guest   /   2007-12-19 16:17:51
3
SKYOVER之陋室 引用 删除 skyover   /   2007-08-29 15:20:36
ParseFromUrl 会下载网页上的图片等东西。
 

评分:0

我来说两句

显示全部

:loveliness: :handshake :victory: :funk: :time: :kiss: :call: :hug: :lol :'( :Q :L ;P :$ :P :o :@ :D :( :)

Open Toolbar