Python 爬虫:爬取数据并以 JSON 格式输出到文件
·
import json
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
class Test:
    """Scrape exhibition entries from a page and dump them to a JSON file.

    NOTE(review): the indentation of the original listing was lost; the
    statements are kept in the class body, which means the scrape runs
    once, when the class is defined. The intermediate values (``url``,
    ``options``, ``driver``, ``wait``, ``result``, ...) remain available as
    class attributes afterwards.

    For every ``div.ex-item`` element on the page the name, address,
    exhibition time and image URL are collected, printed, and finally
    written as a JSON array to ``d:\\data.json``.
    """

    url = 'http://www.test.com/hello'

    options = webdriver.ChromeOptions()
    # Do not load images, to speed up page access.
    options.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2}
    )
    # Drop the "enable-automation" switch; intended to make it harder for
    # sites to detect that Selenium is driving the browser.
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Optional local proxy (note: the flag is spelled "--proxy-server"):
    # options.add_argument("--proxy-server=127.0.0.1:8080")
    # User agent sent with every request.
    ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    # Alternative user agents that were tried:
    # ua = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)'
    # ua = 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'
    options.add_argument('user-agent=' + ua)

    driver = webdriver.Chrome(options=options)
    try:
        driver.maximize_window()
        wait = WebDriverWait(driver, 10)
        driver.get(url)
        # Fixed pause to give the page time to render its items.
        time.sleep(3)

        result = []
        # The find_element(s)_by_* helpers were removed in Selenium 4;
        # find_element(s)(By..., ...) works on both Selenium 3 and 4.
        items = driver.find_elements(By.CSS_SELECTOR, 'div.ex-item')
        for item in items:
            name = item.find_element(
                By.CSS_SELECTOR, 'div.ex-item-bottom div.title').text
            print(name)
            address = item.find_element(
                By.CSS_SELECTOR, 'div.ex-item-bottom p.f-toe').text
            print(address)
            exhi_time = item.find_element(
                By.CSS_SELECTOR, 'div.ex-item-bottom p.item-line').text
            print(exhi_time)
            image = item.find_element(
                By.CSS_SELECTOR, 'div.ex-item-top img').get_attribute('src')
            print(image)
            one = {
                'name': name,
                'address': address,
                'exhi_time': exhi_time,
                'image': image,
            }
            result.append(one)

        # ensure_ascii=False keeps non-ASCII text readable in the file.
        with open('d:\\data.json', 'w', encoding='utf-8') as file:
            file.write(json.dumps(result, indent=2, ensure_ascii=False))

        # Keep the browser window open briefly before shutting down.
        time.sleep(10)
    finally:
        # Always end the browser session, even when scraping fails, so no
        # browser/driver process is left running.
        driver.quit()
输出的 JSON 文件内容示例如下:
[
{
"name": "2020年上海国际展",
"address": "上海世博展览馆",
"exhi_time": "2020/03/24~03/26",
"image": "https://show.test.com/show/imgs/202003/61a840a1373f45122d4e.jpg"
},
{
"name": "中国国际产业展览会",
"address": "上海市徐汇区漕宝路88号",
"exhi_time": "2020/04/10~04/12",
"image": "https://show.test.com/show/imgs/202003/289f27cb5513fad11.jpg"
}
]
本文内容到此结束。

新一代开源开发者平台 GitCode,通过集成代码托管服务、代码仓库以及可信赖的开源组件库,让开发者可以在云端进行代码托管和开发。旨在为数千万中国开发者提供一个无缝且高效的云端环境,以支持学习、使用和贡献开源项目。
更多推荐
所有评论(0)