您的位置:首页 > 技术文章
文章详情页

python - scrapy爬取手机版微博weibo.cn模拟登录出现问题

【字号:】 日期:2022-08-03 14:03:35 浏览:59 作者:猪猪

问题描述

代码如下,不知道为什么一直不能成功登录

# -*- coding: utf-8 -*-
"""Scrapy spider that logs in to the mobile Weibo site (weibo.cn) and scrapes
profile information and repost pages.

Written for Python 2 (it relies on ``raw_input``); the ``print(...)`` calls
use a single argument so they behave the same under Python 2 and 3.
"""
import scrapy
import re
import requests
#import urllib
from bs4 import BeautifulSoup
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, Join
from scrapy.http import Request, FormRequest
from getweibo.items import InformationItem, TweetsItem

loginURL = 'https://login.weibo.cn/login/'


def get_captchainfo(loginURL):
    """Scrape the login form fields and ask the user to solve the captcha.

    Downloads ``loginURL`` with ``requests``, parses it with BeautifulSoup and
    reads the dynamic form values, then prints the captcha image URL and
    prompts on stdin for the captcha text.

    NOTE(review): this is a second, independent GET made through ``requests``
    rather than through Scrapy, so the scraped values may not belong to the
    same session/form as the Scrapy response submitted in ``parse_login`` --
    confirm against a packet capture.

    :param loginURL: URL of the login page to fetch.
    :returns: tuple ``(captcha_input, password_name, vk, capId)``.
    :raises IndexError: if one of the expected ``<input>`` elements is missing.
    :raises AttributeError: if no captcha ``<img>`` is found on the page.
    """
    html = requests.get(loginURL).content
    bs = BeautifulSoup(html, 'lxml')
    # print bs
    # Note: bs.select() returns a list, hence the [0] on every lookup.
    password_name = (bs.select("input[type='password']"))[0].get('name')
    vk = (bs.select("input[name='vk']"))[0].get('value')
    capId = (bs.select("input[name='capId']"))[0].get('value')
    # print password_name,vk,capId
    captcha_img = bs.find(
        'img',
        src=re.compile('http://weibo.cn/interface/f/ttt/captcha/'),
    ).get('src')
    print(captcha_img)
    # The captcha id can be cut directly out of the captcha image URL.
    # urllib.urlretrieve(captcha_img, 'weibo_spider/image/captcha.jpg')
    # print 'captcha download success!'
    captcha_input = raw_input('please input the captcha\n>')  # Python 2 built-in
    return (captcha_input, password_name, vk, capId)


def _strip_title_suffix(title):
    """Return ``title`` cut at ``len(title) - 3`` (drops the last 3 characters)."""
    return title[0:len(title) - 3]


def _between_brackets(text):
    """Return the text between the first '[' and the first ']' in ``text``.

    :raises ValueError: if either bracket is absent (from ``str.index``).
    """
    return text[(text.index('[') + 1):(text.index(']'))]


class WeiboSpider(CrawlSpider):
    """Crawl a Weibo profile on weibo.cn after submitting the login form."""

    name = 'weibo'
    allowed_domains = ['weibo.cn']
    # Temporarily fixed to a single account; start_urls may later be read
    # from a file.
    start_urls = ['http://weibo.cn/dafendi']
    rules = (
        # First rule has no callback; second sends repost links to parse_item.
        Rule(LinkExtractor(restrict_xpaths="//*[@id='pagelist']/form/p/a")),
        Rule(LinkExtractor(restrict_xpaths="//*[contains(@href,'repost')]"),
             callback='parse_item'),
    )
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Content-Type': ' application/x-www-form-urlencoded',
        # NOTE(review): this is a desktop Chrome User-Agent although the
        # spider targets the mobile (WAP) site.
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
        'Referer': 'https://login.weibo.cn/login/',
    }

    def start_requests(self):
        """Start on the login page, using cookie jar ``1`` for the session."""
        return [
            Request(loginURL,
                    meta={'cookiejar': 1},
                    headers=self.headers,
                    callback=self.parse_login),
        ]

    def parse_login(self, response):
        """Post the login page's first form with the account credentials.

        The password field name, ``vk`` and ``capId`` come from
        ``get_captchainfo``; '帐号' / '密码' are placeholders for the real
        account and password.
        """
        print('Preparing login')
        captcha = get_captchainfo(loginURL)
        print(captcha)
        return FormRequest.from_response(
            response,  # from loginURL
            method='POST',
            # Carry over the cookiejar key of the login-page response.
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            formdata={
                'mobile': '帐号',
                captcha[1]: '密码',
                'code': captcha[0],
                'remember': 'on',
                'backurl': 'http%3A%2F%2Fweibo.cn',
                'backtitle': u'手机新浪网',
                'tryCount': '',
                'vk': captcha[2],
                'capId': captcha[3],
                'submit': u'登录',
            },
            callback=self.after_login,
            dont_filter=True,
        )

    def after_login(self, response):
        """Queue the regular start URLs once the login POST has returned."""
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def parse_start_url(self, response):
        """Handle the initial responses: load profile fields into an item."""
        html = response.xpath('/html').extract()
        print(html)
        # Create the loader using the response
        loader = ItemLoader(item=InformationItem(), response=response)
        # Load fields using XPath expressions
        loader.add_xpath('id_', '//title/text()',
                         MapCompose(_strip_title_suffix))
        loader.add_xpath('Info', "//span[contains(@class,'ctt')][2]/text()")
        loader.add_xpath('Num_Tweets', "//span[contains(@class,'tc')]/text()",
                         MapCompose(_between_brackets))
        loader.add_xpath('Num_Follows', "//a[contains(@href,'follow')]/text()",
                         MapCompose(_between_brackets))
        loader.add_xpath('Num_Fans', "//a[contains(@href,'fans')]/text()",
                         MapCompose(_between_brackets))
        return loader.load_item()

    def parse_item(self, response):
        """Load the text of a repost page into a ``TweetsItem``."""
        loader = ItemLoader(item=TweetsItem(), response=response)
        loader.add_xpath('Content', "//span[contains(@class,'ctt')]/text()")
        # loader.add_xpath('')
        return loader.load_item()

下边是settings.py的内容

# Scrapy project settings used by the spider above.
ROBOTSTXT_OBEY = False
# Let responses with status 302 reach the spider callbacks instead of being
# dropped as HTTP errors.
HTTPERROR_ALLOWED_CODES = [302]
# Turn off redirect handling, so redirects are not followed to the new address.
REDIRECT_ENABLED = False
DOWNLOAD_DELAY = 3
COOKIES_ENABLED = True
COOKIES_DEBUG = True

下边是输出

2017-04-09 15:53:17 [scrapy] DEBUG: Sending cookies to: <POST https://login.weibo.cn/login/?rand=201282002&backURL=http%3A%2F%2Fweibo.cn&backTitle=%E6%89%8B%E6%9C%BA%E6%96%B0%E6%B5%AA%E7%BD%91&vt=4>Cookie: _T_WM=6348fb8a523fe1bc486f14d1304cf0d22017-04-09 15:53:19 [scrapy] DEBUG: Received cookies from: <302 https://login.weibo.cn/login/?rand=201282002&backURL=http%3A%2F%2Fweibo.cn&backTitle=%E6%89%8B%E6%9C%BA%E6%96%B0%E6%B5%AA%E7%BD%91&vt=4>Set-Cookie: WEIBOCN_FROM=deleted; expires=Thu, 01-Jan-1970 00:00:01 GMT; path=/; domain=.weibo.cnSet-Cookie: SUB=_2A2517Zg9DeRhGeVG61ER8yrEwzyIHXVXETh1rDV6PUJbkdAKLRXgkW0wSZc8S6dp1d-NlyAraSqa-1-_0Q..; expires=Tue, 09-May-2017 07:53:17 GMT; path=/; domain=.weibo.cn; httponlySet-Cookie: gsid_CTandWM=4uuCcdef1lRXUEnMtsgL1fXlgec; expires=Tue, 09-May-2017 07:53:19 GMT; path=/; domain=.weibo.cn; httponly2017-04-09 15:53:19 [scrapy] DEBUG: Crawled (302) <POST https://login.weibo.cn/login/?rand=201282002&backURL=http%3A%2F%2Fweibo.cn&backTitle=%E6%89%8B%E6%9C%BA%E6%96%B0%E6%B5%AA%E7%BD%91&vt=4> (referer: https://login.weibo.cn/login/)2017-04-09 15:53:20 [scrapy] DEBUG: Received cookies from: <200 http://weibo.cn/dafendi>Set-Cookie: _T_WM=80e15f38a0dfb65ea7bbcd00ebcaf1c0; expires=Tue, 09-May-2017 07:53:19 GMT; path=/; domain=.weibo.cn; httponlySet-Cookie: WEIBOCN_FROM=deleted; expires=Thu, 01-Jan-1970 00:00:01 GMT; path=/; domain=.weibo.cn2017-04-09 15:53:20 [scrapy] DEBUG: Crawled (200) <GET http://weibo.cn/dafendi> (referer: https://login.weibo.cn/login/?rand=201282002&backURL=http%3A%2F%2Fweibo.cn&backTitle=%E6%89%8B%E6%9C%BA%E6%96%B0%E6%B5%AA%E7%BD%91&vt=4)2017-04-09 15:53:20 [scrapy] DEBUG: Scraped from <200 http://weibo.cn/dafendi>{’Info’: [u’u8ba4u8bc1uff1au77e5u540du5e7du9ed8u535au4e3b u5faeu535au7b7eu7ea6u81eau5a92u4f53’], ’Num_Fans’: [u’2055326’], ’Num_Follows’: [u’891’], ’Num_Tweets’: [u’1958’], ’id_’: [u’u7cbeu5206u541b’]}2017-04-09 15:53:20 [scrapy] DEBUG: Sending cookies to: <GET 
http://weibo.cn/repost/EDsDTFqfJ?rl=0&uid=2626948743>Cookie: _T_WM=80e15f38a0dfb65ea7bbcd00ebcaf1c02017-04-09 15:53:20 [scrapy] DEBUG: Sending cookies to: <GET http://weibo.cn/repost/EDxAwrBrG?rl=0&uid=2626948743>Cookie: _T_WM=80e15f38a0dfb65ea7bbcd00ebcaf1c02017-04-09 15:53:20 [scrapy] DEBUG: Sending cookies to: <GET http://weibo.cn/repost/EDBmajRBl?rl=0&uid=2626948743>Cookie: _T_WM=80e15f38a0dfb65ea7bbcd00ebcaf1c02017-04-09 15:53:20 [scrapy] DEBUG: Sending cookies to: <GET http://weibo.cn/repost/CsN9LnQiG?rl=0&uid=2626948743>Cookie: _T_WM=80e15f38a0dfb65ea7bbcd00ebcaf1c02017-04-09 15:53:24 [scrapy] DEBUG: Received cookies from: <200 http://weibo.cn/repost/EDsDTFqfJ?rl=0&uid=2626948743>Set-Cookie: WEIBOCN_FROM=deleted; expires=Thu, 01-Jan-1970 00:00:01 GMT; path=/; domain=.weibo.cn2017-04-09 15:53:24 [scrapy] DEBUG: Crawled (200) <GET http://weibo.cn/repost/EDsDTFqfJ?rl=0&uid=2626948743> (referer: http://weibo.cn/dafendi)2017-04-09 15:53:24 [scrapy] DEBUG: Scraped from <200 http://weibo.cn/repost/EDsDTFqfJ?rl=0&uid=2626948743>{’Content’: [u’:’, u’ u5047u5982u4efbu4f55u4e8bu90fdu80fdu6210u4e3au804cu4e1auff0cu4f60u4f1au9009u62e9u4ec0u4e48u4f5cu4e3au804cu4e1auff1f u200bu200bu200b’]}2017-04-09 15:53:28 [scrapy] DEBUG: Received cookies from: <200 http://weibo.cn/repost/EDxAwrBrG?rl=0&uid=2626948743>Set-Cookie: WEIBOCN_FROM=deleted; expires=Thu, 01-Jan-1970 00:00:01 GMT; path=/; domain=.weibo.cn2017-04-09 15:53:28 [scrapy] DEBUG: Crawled (200) <GET http://weibo.cn/repost/EDxAwrBrG?rl=0&uid=2626948743> (referer: http://weibo.cn/dafendi)2017-04-09 15:53:28 [scrapy] DEBUG: Scraped from <200 http://weibo.cn/repost/EDxAwrBrG?rl=0&uid=2626948743>{’Content’: [u’u7279u522bu7684u751fu65e5u793cu7269u3002 u200bu200bu200b’]}2017-04-09 15:53:32 [scrapy] DEBUG: Received cookies from: <200 http://weibo.cn/repost/EDBmajRBl?rl=0&uid=2626948743>Set-Cookie: WEIBOCN_FROM=deleted; expires=Thu, 01-Jan-1970 00:00:01 GMT; path=/; domain=.weibo.cn2017-04-09 15:53:32 [scrapy] DEBUG: Crawled 
(200) <GET http://weibo.cn/repost/EDBmajRBl?rl=0&uid=2626948743> (referer: http://weibo.cn/dafendi)2017-04-09 15:53:32 [scrapy] DEBUG: Scraped from <200 http://weibo.cn/repost/EDBmajRBl?rl=0&uid=2626948743>{’Content’: [u’u7231u7b11u7684u5973u5b69u5b50uff0cu8fd0u6c14u4e00u5b9au4e0du4f1au592au597du2026u2026’, u’ u200bu200bu200b’]}2017-04-09 15:53:36 [scrapy] DEBUG: Received cookies from: <200 http://weibo.cn/repost/CsN9LnQiG?rl=0&uid=2626948743>Set-Cookie: WEIBOCN_FROM=deleted; expires=Thu, 01-Jan-1970 00:00:01 GMT; path=/; domain=.weibo.cn2017-04-09 15:53:36 [scrapy] DEBUG: Crawled (200) <GET http://weibo.cn/repost/CsN9LnQiG?rl=0&uid=2626948743> (referer: http://weibo.cn/dafendi)2017-04-09 15:53:36 [scrapy] DEBUG: Scraped from <200 http://weibo.cn/repost/CsN9LnQiG?rl=0&uid=2626948743>{’Content’: [u’:u4e00u4e2au957fu5faeu535au5408u96c6uff0cu5927u5bb6u65e0u804au53c8u6ca1u770bu8fc7u7684u8bddu53efu4ee5u770bu770b[u7f9eu55d2u55d2] u200bu200bu200b’]}2017-04-09 15:53:36 [scrapy] INFO: Closing spider (finished)2017-04-09 15:53:36 [scrapy] INFO: Stored json feed (5 items) in: wanghongmingdan.json2017-04-09 15:53:36 [scrapy] INFO: Dumping Scrapy stats:{’downloader/request_bytes’: 3029, ’downloader/request_count’: 7, ’downloader/request_method_count/GET’: 6, ’downloader/request_method_count/POST’: 1, ’downloader/response_bytes’: 22746, ’downloader/response_count’: 7, ’downloader/response_status_count/200’: 6, ’downloader/response_status_count/302’: 1, ’finish_reason’: ’finished’, ’finish_time’: datetime.datetime(2017, 4, 9, 7, 53, 36, 596076), ’item_scraped_count’: 5, ’log_count/DEBUG’: 27, ’log_count/INFO’: 8, ’log_count/WARNING’: 2, ’request_depth_max’: 3, ’response_received_count’: 7, ’scheduler/dequeued’: 7, ’scheduler/dequeued/memory’: 7, ’scheduler/enqueued’: 7, ’scheduler/enqueued/memory’: 7, ’start_time’: datetime.datetime(2017, 4, 9, 7, 53, 2, 180831)}2017-04-09 15:53:36 [scrapy] INFO: Spider closed (finished)2017-04-09 20:11:50 [scrapy] DEBUG: Redirecting (302) to 
<GET http://weibo.cn/crossDomain/?g=4uegcdef1d93rkj4S3ZomfXlgec&t=1491739909&m=9144&r=&u=http%3A%2F%2Fweibo.cn%3Fgsid%3D4uegcdef1d93rkj4S3ZomfXlgec%26PHPSESSID%3D%26vt%3D4&cross=1&st=ST-MzgwMzAzNDg4MA==-1491739909-tc-27ED8C8D7528C9185E75F7986B8050B7-1,ST-MzgwMzAzNDg4MA==-1491739909-tc-BED83CC16AC311D2BBA234E8F08BBD39-1> from <POST https://login.weibo.cn/login/?rand=842328789&backURL=http%3A%2F%2Fweibo.cn&backTitle=%E6%89%8B%E6%9C%BA%E6%96%B0%E6%B5%AA%E7%BD%91&vt=4>2017-04-09 20:11:50 [scrapy] DEBUG: Redirecting (meta refresh) to <GET http://weibo.cn/> from <GET http://weibo.cn/crossDomain/?g=4uegcdef1d93rkj4S3ZomfXlgec&t=1491739909&m=9144&r=&u=http%3A%2F%2Fweibo.cn%3Fgsid%3D4uegcdef1d93rkj4S3ZomfXlgec%26PHPSESSID%3D%26vt%3D4&cross=1&st=ST-MzgwMzAzNDg4MA==-1491739909-tc-27ED8C8D7528C9185E75F7986B8050B7-1,ST-MzgwMzAzNDg4MA==-1491739909-tc-BED83CC16AC311D2BBA234E8F08BBD39-1>

问题解答

回答1:

建议你在做模拟登录的时候,打开抓包软件抓包,进行调试,这样你才能知道通过程序请求目标服务器返回的内容和你手动请求服务器返回的内容是否有差异。对于微博数据采集我也有一定的经验,我刚看了你的代码,发现和我以前写的模拟登录微博的代码有一定的差异,这是我的代码,我刚检查了还能用。我又去对比了一下我们两份代码的差异,发现你虽然抓的是wap版微博,但是你的UA用的是PC端的UA,所以会弹出验证码,提交的参数也不相同。你的代码出错应该是因为有一步跳转需要手动访问,而你没有进行访问,这个你可以抓包看看。感觉现在微博wap端的反爬也开始重视起来了啊。如果想更好地理解模拟登录微博,可以看看我的这篇文章。截至现在,该方法都可用。

标签: 微博 Python
相关文章: