博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
python实训第四天
阅读量:7103 次
发布时间:2019-06-28

本文共 4605 字,大约阅读时间需要 15 分钟。

# 爬虫全过程: #     1.发送请求(请求库) # requests # selenium #     2.获取相应数据(服务器返回) #     3.解析并提取数据(解析库) # re正则 # bs4(beautifulsoup4) # Xpath #     4.保存数据(存储库) # mongDB # 爬虫框架 # Scrapy(基于面向对象) # requests模块详细使用 # http协议: #     请求url: # #     请求方式: #     GET #     请求头: #     Cookie:可能需要关注 #     User-Agent:用来证明自己是浏览器 #import requests # response=requests.get() # print(response.status_code) # print(response.text) # 携带请求头参数访问知乎 # import requests # # 请求字典 # headers={'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36' # # } # # 在get请求内,添加user-agent # response=requests.get(url='https://www.zhihu.com/explore',headers=headers) # print(response.status_code) # with open('zhihu.html','w',encoding='utf-8') as f: #     f.write(response.text) # # 携带cookies # 携带登录cookies破解github登录验证 # 请求url:。。。。 # 请求方式:。。。。 # 请求头:。。。。 import requests url='https://home.cnblogs.com/set/' headers={'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',          'Cookies':'_ga=GA1.2.736039140.1560498371; _gid=GA1.2.393939017.1560498371; __gads=ID=22042d69ef7c440a:T=1560498371:S=ALNI_MZjbBvbmYulhYR0hD7DDAvxO0aolQ; .Cnblogs.AspNetCore.Cookies=CfDJ8D8Q4oM3DPZMgpKI1MnYlrl5aDQbB7qHF12lN377FcJeizO5Dr4IA_1e7Aq8woZhTxdhKDrbe8NA3gDFqxX5fXn7Op4tblZ3WlqCLIBc9yYqTekcG0jfa9xAH-ur9i-QKr9dvFLlxL1TVSknTiV9iA9nxENBL_WJqnpg8Lo7M5DkfKd0hslNAvuFza9WE3InaBkqJom6ThPvt0z-LN0yviYk5duwVIT8HM1tfOHM2KT_ERkPqKSUTgVRKYGKWrMsG89yDtjKBL1lp0IjzQtzIzK0215tgd3fh0guFL2U994D-ZgHTQthJ0ZZErBUrZ3Z2aHMiJnHXVJLWW3NWAlRuk-R4snWbHpJt8diYsfn-P-q79Ms2SmCAKEg8Vqzf41Qb5lYT_qvGWw0vU3uZwglGwb6KycLuTwKVIXYcrrmgR_F5mFa6MnIoylo1ljVhgRROZgBVQz15SMONXFGTpaX8zI; .CNBlogsCookie=A7F62226302E1403835FB5491EFFE521C6FEB4D05375BC64EC3B87D308A75E4372DBFD7E26B197F93A52D7C4212BA3EF74F4A65A51B7CA92266DAA7F0365C3C7FE6BA6294557EF3FB7CAB11990D3E5723D5FEB51; _gat=1'} github_res=requests.get(url, headers=headers) import requests cookies={'Cookies':'_ga=GA1.2.736039140.1560498371; _gid=GA1.2.393939017.1560498371; __gads=ID=22042d69ef7c440a:T=1560498371:S=ALNI_MZjbBvbmYulhYR0hD7DDAvxO0aolQ; .Cnblogs.AspNetCore.Cookies=CfDJ8D8Q4oM3DPZMgpKI1MnYlrl5aDQbB7qHF12lN377FcJeizO5Dr4IA_1e7Aq8woZhTxdhKDrbe8NA3gDFqxX5fXn7Op4tblZ3WlqCLIBc9yYqTekcG0jfa9xAH-ur9i-QKr9dvFLlxL1TVSknTiV9iA9nxENBL_WJqnpg8Lo7M5DkfKd0hslNAvuFza9WE3InaBkqJom6ThPvt0z-LN0yviYk5duwVIT8HM1tfOHM2KT_ERkPqKSUTgVRKYGKWrMsG89yDtjKBL1lp0IjzQtzIzK0215tgd3fh0guFL2U994D-ZgHTQthJ0ZZErBUrZ3Z2aHMiJnHXVJLWW3NWAlRuk-R4snWbHpJt8diYsfn-P-q79Ms2SmCAKEg8Vqzf41Qb5lYT_qvGWw0vU3uZwglGwb6KycLuTwKVIXYcrrmgR_F5mFa6MnIoylo1ljVhgRROZgBVQz15SMONXFGTpaX8zI; .CNBlogsCookie=A7F62226302E1403835FB5491EFFE521C6FEB4D05375BC64EC3B87D308A75E4372DBFD7E26B197F93A52D7C4212BA3EF74F4A65A51B7CA92266DAA7F0365C3C7FE6BA6294557EF3FB7CAB11990D3E5723D5FEB51; _gat=1'} github_res=requests.get(url,headers=headers,cookies=cookies) print('2545976330@qq.com' in github_res.text) 多线程爬虫
# 3.6python解释器 3.7报错shutdown import requests import re import uuid from concurrent.futures import ThreadPoolExecutor#导入线程池模块 pool=ThreadPoolExecutor(50)#线程池限制50个线程 # 爬虫三部曲 # 1.发送请求 def get_page(url):     print(f'开始异步任务:{url}')     response=requests.get(url)     return response # 2.解析数据 # 解析主页获取视频详情ID def parse_index(res):     response=res.result()     id_list = re.findall('
'''''' ''' 主页:     https://movie.douban.com/top250     GET     User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36 re正则:     # 电影详情页url、图片链接、电影名称、电影评分、评价人数     
.*?href="(.*?)">.*?src="(.*?)".*?
(.*?).*?
(.*?).*?
(.*?)人评价 ''' import requests import re url = 'https://movie.douban.com/top250' headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36' } # 1、往豆瓣TOP250发送请求获取响应数据 response = requests.get(url, headers=headers) # print(response.text) # 2、通过正则解析提取数据 # 电影详情页url、图片链接、电影名称、电影评分、评价人数 movie_content_list = re.findall( # 正则规则 '
.*?href="(.*?)">.*?src="(.*?)".*?
(.*?).*?
(.*?).*?
(.*?)人评价', # 解析文本 response.text, # 匹配模式 re.S) for movie_content in movie_content_list: # 解压赋值每一部电影 detail_url, movie_jpg, name, point, num = movie_content data = f'电影名称:{name}, 详情页url:{detail_url}, 图片url:{movie_jpg}, 评分: {point}, 评价人数: {num} \n' print(data) # 3、保存数据,把电影信息写入文件中 with open('douban.txt', 'a', encoding='utf-8') as f: f.write(data)

 

 
 
 

转载于:https://www.cnblogs.com/7777qqq/p/11025103.html

你可能感兴趣的文章
[LeetCode]Longest Consecutive Sequence
查看>>
NCBI通过氨基酸位置查看相邻SNP
查看>>
CAS SSO单点登录框架学习
查看>>
好书推荐——《启动大脑》
查看>>
网络流24题 -No.17 运输问题
查看>>
MySQL数据库的主从复制简单学习使用
查看>>
kprobe原理与实现笔记
查看>>
sql语句优化
查看>>
Topological Sorting
查看>>
sql语句大全
查看>>
MongoDB server side Javascript 如何直接传入字符串?
查看>>
ZooKeeper学习第五期--ZooKeeper管理分布式环境中的数据
查看>>
羽毛球比赛
查看>>
php基础
查看>>
(实用篇)PHP缓存类完整实例
查看>>
iOS7.1 编译报错 解决方案 体会
查看>>
maven常见命令总结
查看>>
LightLife小组Alfha冲刺(第六天)
查看>>
U盘用FAT32还是用NTFS格式好
查看>>
Elasticsearch Java教程
查看>>