C#爬虫-Selenium ChromeDriver 设置代理

背景

开发爬虫程序时,如果不做代理设置,本机的外网IP很容易被网站封掉,导致不能持续进行数据抓取。而Selenium作为动态网页抓取的利器,我们有必要了解一下如何对它进行代理设置,并正常访问网页。

解决办法

1、首先申请代理IP,一般付费的才比较靠谱。申请后会得到代理的域名地址、端口、账号和密码。

  // Proxy connection settings. Every value below is a placeholder from the
  // article and must be replaced with the values issued by the proxy provider.

  // Host name (or address) of the proxy server.
  private string proxy_Host = "域名地址";
  // Port of the proxy server. The name is spelled "Post" but it is combined with
  // proxy_Host as "host:port" for --proxy-server; the spelling is kept because
  // other code refers to it. 端口 is a placeholder for the numeric port.
  private int proxy_Post = 端口;
  // Account name used for proxy authentication.
  private string proxy_UserName = "账号";
  // Password used for proxy authentication.
  private string proxy_PassWord = "密码";
  // Address used to check that the proxy is working (not referenced by the
  // snippets shown here — presumably loaded after the driver starts; confirm).
  private string proxy_CheckURL = "检查是否正常的地址";
  // File name of the Chrome extension archive that carries the proxy credentials.
  private string Ex_Proxy_Name = "proxy.zip";

2、设置chrome background.js、manifest.json

  /// <summary>
  /// Builds the Chrome extension archive (named by <c>Ex_Proxy_Name</c>, written to the
  /// current directory) containing a <c>background.js</c> that answers proxy
  /// authentication challenges with the given credentials, plus its <c>manifest.json</c>.
  /// Any existing archive with the same name is overwritten.
  /// </summary>
  /// <param name="proxy_UserName">Proxy account name; null is treated as empty.</param>
  /// <param name="proxy_PassWord">Proxy password; null is treated as empty.</param>
  /// <returns><c>true</c> when the archive was written; <c>false</c> when anything failed.</returns>
  private bool Rebuild_Extension_Proxy(string proxy_UserName, string proxy_PassWord)
  {
      // The credentials are embedded inside single-quoted JavaScript literals, so they
      // must be escaped; otherwise a quote or backslash in the password yields a
      // background.js that does not parse and the proxy never authenticates.
      string userLiteral = Escape_For_Js_String(proxy_UserName);
      string passLiteral = Escape_For_Js_String(proxy_PassWord);

      // NOTE(review): the onMessage listener below references requestHandler and
      // POPUP_PARAMS, which this script never defines; it is kept as in the original.
      string background = @"
var Global = {
    currentProxyAouth: {
        username: '',
        password: ''
    }
};

Global.currentProxyAouth = {
    username: '" + userLiteral + @"',
    password: '" + passLiteral + @"'
};

chrome.webRequest.onAuthRequired.addListener(
    function(details, callbackFn) {
        console.log('onAuthRequired >>>: ', details, callbackFn);
        callbackFn({
            authCredentials: Global.currentProxyAouth
        });
    }, {
        urls: [""<all_urls>""]
    }, [""asyncBlocking""]);

chrome.runtime.onMessage.addListener(
    function(request, sender, sendResponse) {
        console.log('Background recieved a message: ', request);

        POPUP_PARAMS = {};
        if (request.command && requestHandler[request.command])
            requestHandler[request.command] (request);
    }
);";

      // NOTE(review): this is a Manifest V2 extension; confirm the target Chrome
      // version still loads manifest_version 2.
      string manifest = @"
{
    ""version"": ""1.0.0"",
    ""manifest_version"": 2,
    ""name"": ""Chrome Proxy"",
    ""permissions"": [
        ""proxy"",
        ""tabs"",
        ""unlimitedStorage"",
        ""storage"",
        ""<all_urls>"",
        ""webRequest"",
        ""webRequestBlocking""
    ],
    ""background"": {
        ""scripts"": [""background.js""]
    },
    ""minimum_chrome_version"":""22.0.0""
}";

      string zipPath = Path.Combine(System.Environment.CurrentDirectory, Ex_Proxy_Name);

      try
      {
          // The archive is always built from scratch, so Create mode is sufficient;
          // the using blocks close the entry writers, the archive and the file even
          // when a write fails part-way.
          using (FileStream zipStream = new FileStream(zipPath, FileMode.Create))
          using (ZipArchive archive = new ZipArchive(zipStream, ZipArchiveMode.Create))
          {
              Write_Zip_Entry(archive, "background.js", background);
              Write_Zip_Entry(archive, "manifest.json", manifest);
          }

          return true;
      }
      catch (Exception ex)
      {
          // Callers only look at the boolean, so keep the "never throws" contract but
          // leave a trace instead of discarding the failure silently.
          System.Diagnostics.Trace.TraceError("Failed to build proxy extension '{0}': {1}", zipPath, ex);
          return false;
      }
  }

  /// <summary>Writes <paramref name="content"/> followed by a line break as a new entry of the archive.</summary>
  private static void Write_Zip_Entry(ZipArchive archive, string entryName, string content)
  {
      ZipArchiveEntry entry = archive.CreateEntry(entryName);
      using (StreamWriter writer = new StreamWriter(entry.Open()))
      {
          writer.WriteLine(content);
      }
  }

  /// <summary>Escapes a value so it can be placed between single quotes in JavaScript source.</summary>
  private static string Escape_For_Js_String(string value)
  {
      if (string.IsNullOrEmpty(value))
      {
          return "";
      }

      // Backslashes first, so the escapes added afterwards are not escaped again.
      return value
          .Replace("\\", "\\\\")
          .Replace("'", "\\'")
          .Replace("\r", "\\r")
          .Replace("\n", "\\n")
          .Replace("\u2028", "\\u2028")
          .Replace("\u2029", "\\u2029");
  }

3、Chrome Driver使用代理Proxy

 // 設置 Chrome Driver Exyension Proxy 設定                bool isproxysetting = true;                if (_isuseproxy)                {                    isproxysetting = Rebuild_Extension_Proxy(proxy_UserName, proxy_PassWord);                }
                if (isproxysetting)                {                    // Driver 設定                    options = new ChromeOptions();                    if (_isuseproxy)                    {                        options.Proxy = null;                        options.AddArguments("--proxy-server=" + proxy_Host + ":" + proxy_Post.ToString());                        options.AddExtension(Ex_Proxy_Name);                    }

4、测试一下我们的设置

  /// <summary>
  /// Parses the page content returned by the proxy check address into a
  /// <c>Proxy_Unit.ProxyIPInfo</c>.
  /// </summary>
  /// <param name="Html_Content">
  /// Page source of the check page: a JSON document, optionally wrapped in the
  /// <c>&lt;html&gt;…&lt;pre&gt;</c> shell that is stripped below (presumably what the
  /// browser puts around a plain-text response — confirm).
  /// </param>
  /// <returns>
  /// The deserialized info, or <c>null</c> when the content is null or empty, contains
  /// the text "proxy error", or cannot be deserialized.
  /// </returns>
  private Proxy_Unit.ProxyIPInfo Get_ProxyIPInfo(string Html_Content)
  {
      // Handle missing content explicitly instead of relying on a swallowed
      // NullReferenceException.
      if (string.IsNullOrEmpty(Html_Content))
      {
          return null;
      }

      // Remove the HTML shell so that only the JSON payload remains.
      string json = Html_Content
          .Replace("<html><head></head><body><pre style=\"word-wrap: break-word; white-space: pre-wrap;\">", "")
          .Replace("</pre></body></html>", "");

      if (json.Contains("proxy error"))
      {
          return null;
      }

      try
      {
          return JsonConvert.DeserializeObject<Proxy_Unit.ProxyIPInfo>(json);
      }
      catch (Exception ex)
      {
          // Callers treat null as "proxy not usable"; record why parsing failed
          // rather than discarding the exception.
          System.Diagnostics.Trace.TraceWarning("Could not parse proxy check response: {0}", ex);
          return null;
      }
  }

测试效果

成功,达到预期效果

{    "ip":"213.182.205.185",    "country":"IS",    "asn":{        "asnum":9009,        "org_name":"M247 Ltd"    },    "geo":{        "city":"Reykjavik",        "region":"1",        "region_name":"Capital Region",        "postal_code":"105",        "latitude":64.1369,        "longitude":-21.9139,        "tz":"Atlantic/Reykjavik",        "lum_city":"reykjavik",        "lum_region":"1"    }}

总结

我们之前在为ChromeDriver设定Proxy时遇到过许多困难,最终需要通过Chrome Extension的方式来设定Proxy才成功。希望以上内容能帮助您更好地了解这一做法。