selenium+headless chrome爬虫的实现示例
Python爬虫写起来非常快,虽然也可以用Java,但是没有Python来得简洁迅速。
Selenium在前面总结过,是一个自动化测试库。headless chrome是无界面的浏览器模式,和PhantomJS类似。但是PhantomJS往往会出现莫名的错误,而且速度没有headless chrome快。
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Start Chrome without a visible window.
# NOTE: `chrome_options=` is the Selenium 3 keyword; newer Selenium releases
# name this keyword `options=`.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
DRIVER = webdriver.Chrome(chrome_options=chrome_options)
爬虫的代码有一点需要注意:需要操作事件的时候,最好不要直接调用相应的方法(比如click),而是以嵌入js脚本的方式进行调用。因为爬虫代码的执行速度很快,前端元素结构往往反应不过来,从而抛出元素不可见或者不存在的错误。
province_items = DRIVER.find_element_by_class_name("city-province").find_elements_by_tag_name("a")
for province_item in province_items:
    # province_item.click()
    # Trigger the click from injected JavaScript instead of the native
    # WebElement.click() shown above.
    DRIVER.execute_script('arguments[0].click();', province_item)
下面来个例子。由于做电商平台,省、市、区的数据很好找,但是没有镇、街道的信息,这里通过爬虫从淘宝网将镇、街道的信息抓取下来。
#!/usr/local/bin/python
#encoding:utf-8
'''
Created on 2018年1月5日
@author:wulinfeng
@date:2018-1-5
'''
importtime
#importrequest
fromseleniumimportwebdriver
#fromselenium.webdriver.common.desired_capabilitiesimportDesiredCapabilities
fromselenium.webdriver.chrome.optionsimportOptions
importpymysql
def init_db():
    """Open the MySQL connection and publish it as the global ``CONNECTION``.

    The host, user, password and database values below are placeholders that
    must be replaced with real settings before the script is run.
    """
    global CONNECTION
    # Keyword arguments are used because PyMySQL >= 1.0 no longer accepts
    # positional connection parameters; older releases accept these keywords
    # as well.
    CONNECTION = pymysql.connect(
        host="地址",
        user="用户名",
        password="密码",
        database="数据库",
        use_unicode=True,
        charset="utf8",
    )
def init_web_driver():
    """Start a headless Chrome session and publish it as the global ``DRIVER``.

    Chrome is launched with ``--headless`` and ``--disable-gpu``. Earlier
    experiments with PhantomJS, IE and a windowed Chrome were kept in this
    function as dead code; they have been removed because the inert string
    block holding them contained invalid ``\\p`` escape sequences.
    """
    global DRIVER
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    # NOTE: `chrome_options=` is the Selenium 3 keyword, matching the
    # `find_element_by_*` calls used by the rest of this script; newer
    # Selenium releases name this keyword `options=`.
    DRIVER = webdriver.Chrome(chrome_options=chrome_options)
def close_db():
    """Close the global MySQL connection held in ``CONNECTION``."""
    connection = CONNECTION
    connection.close()
def close_web_driver():
    """Quit the global browser session held in ``DRIVER``."""
    driver = DRIVER
    driver.quit()
def login_taobao(username, password):
    """Log in to Taobao through the username/password form.

    Loads the delivery-address page, switches the login widget to the static
    form, types the credentials and submits them, then pauses briefly.

    Args:
        username: Text typed into the ``TPL_username_1`` input.
        password: Text typed into the ``TPL_password_1`` input.
    """
    address_page = (
        "https://member1.taobao.com/member/fresh/deliver_address.htm"
        "?spm=a1z08.2.0.0.7dad47611Wnj46"
    )
    DRIVER.get(address_page)

    # Choose the login method.
    switch_link = DRIVER.find_element_by_xpath('//*[@id="J_Quick2Static"]')
    switch_link.click()

    # Fill in the credentials.
    user_field = DRIVER.find_element_by_xpath('//*[@id="TPL_username_1"]')
    user_field.clear()
    user_field.send_keys(username)
    password_field = DRIVER.find_element_by_xpath('//*[@id="TPL_password_1"]')
    password_field.send_keys(password)

    # Submit, then give the page a moment before the caller continues.
    submit_button = DRIVER.find_element_by_xpath('//*[@id="J_SubmitStatic"]')
    submit_button.click()
    time.sleep(0.5)
def get_data():
    """Open the address selector and crawl the regions starting at provinces."""
    # Open the address selector with a JavaScript click rather than a native
    # WebElement.click().
    DRIVER.execute_script(
        'arguments[0].click();',
        DRIVER.find_element_by_id("city-title"),
    )
    get_province_and_sub()
def get_province_and_sub():
    """Store every listed province and crawl the cities below each one.

    For each link under the ``city-province`` element the ``attr-id`` and
    ``title`` attributes are inserted into ``region_province_t``. The link is
    then clicked via JavaScript, ``get_city_and_sub`` handles the next level,
    and ``back_tab(0)`` is called before moving to the next province.

    Links whose ``attr-id`` is ``"-1"`` are skipped (presumably a placeholder
    entry -- confirm against the page).
    """
    # Fetch the province links.
    province_items = DRIVER.find_element_by_class_name("city-province").find_elements_by_tag_name("a")
    # Values are passed as query parameters so that quotes or other special
    # characters in the scraped text cannot break or alter the statement.
    sql = "insert into region_province_t(province_id,province) values(%s,%s)"
    for province_item in province_items:
        pid = province_item.get_attribute("attr-id")
        pname = province_item.get_attribute("title")
        if pid == "-1":
            print("continueprovince")
            continue
        params = (pid, pname)
        print(sql, params)
        # The context manager closes the cursor once the insert is done.
        with CONNECTION.cursor() as cursor:
            cursor.execute(sql, params)
        CONNECTION.commit()
        # Click through JavaScript instead of province_item.click().
        DRIVER.execute_script('arguments[0].click();', province_item)
        time.sleep(0.5)
        get_city_and_sub(pid)
        back_tab(0)
def get_city_and_sub(pid):
    """Store every listed city of one province and crawl its districts.

    For each link under the ``city-city`` element the ``attr-id`` and
    ``title`` attributes are inserted into ``region_city_t`` together with
    ``pid``. The link is then clicked via JavaScript, ``get_area_and_sub``
    handles the next level, and ``back_tab(1)`` is called before moving on.

    Links whose ``attr-id`` is ``"-1"`` are skipped (presumably a placeholder
    entry -- confirm against the page).

    Args:
        pid: Province id stored in the ``province_id`` column of each row.
    """
    # Fetch the city links.
    city_items = DRIVER.find_element_by_class_name("city-city").find_elements_by_tag_name("a")
    # Values are passed as query parameters so that quotes or other special
    # characters in the scraped text cannot break or alter the statement.
    sql = "insert into region_city_t(city_id,city,province_id) values(%s,%s,%s)"
    for city_item in city_items:
        cid = city_item.get_attribute("attr-id")
        cname = city_item.get_attribute("title")
        if cid == "-1":
            print("continuecity")
            continue
        params = (cid, cname, pid)
        print(sql, params)
        # The context manager closes the cursor once the insert is done.
        with CONNECTION.cursor() as cursor:
            cursor.execute(sql, params)
        CONNECTION.commit()
        # Click through JavaScript instead of city_item.click().
        DRIVER.execute_script('arguments[0].click();', city_item)
        time.sleep(1)
        get_area_and_sub(cid)
        back_tab(1)
def get_area_and_sub(cid):
    """Store every listed district of one city and crawl its towns.

    For each link under the ``city-district`` element the ``attr-id`` and
    ``title`` attributes are inserted into ``region_area_t`` together with
    ``cid``. The link is then clicked via JavaScript, ``get_town_and_sub``
    handles the next level, and ``back_tab(2)`` is called before moving on.

    Links whose ``attr-id`` is ``"-1"`` are skipped (presumably a placeholder
    entry -- confirm against the page).

    Args:
        cid: City id stored in the ``city_id`` column of each row.
    """
    # Fetch the county/district links.
    area_items = DRIVER.find_element_by_class_name("city-district").find_elements_by_tag_name("a")
    # Values are passed as query parameters so that quotes or other special
    # characters in the scraped text cannot break or alter the statement.
    sql = "insert into region_area_t(area_id,area,city_id) values(%s,%s,%s)"
    for area_item in area_items:
        aid = area_item.get_attribute("attr-id")
        aname = area_item.get_attribute("title")
        if aid == "-1":
            print("continuearea")
            continue
        params = (aid, aname, cid)
        print(sql, params)
        # The context manager closes the cursor once the insert is done.
        with CONNECTION.cursor() as cursor:
            cursor.execute(sql, params)
        CONNECTION.commit()
        # Click through JavaScript instead of area_item.click().
        DRIVER.execute_script('arguments[0].click();', area_item)
        time.sleep(0.5)
        get_town_and_sub(aid)
        back_tab(2)
def get_town_and_sub(aid):
    """Store every listed town/street of one district.

    For each link under the ``city-street`` element the ``attr-id`` and
    ``title`` attributes are inserted into ``region_town_t`` together with
    ``aid``. This is the last level; nothing is clicked here.

    Links whose ``attr-id`` is ``"-1"`` are skipped (presumably a placeholder
    entry -- confirm against the page).

    Args:
        aid: District id stored in the ``area_id`` column of each row.
    """
    # Fetch the town links.
    town_items = DRIVER.find_element_by_class_name("city-street").find_elements_by_tag_name("a")
    # Values are passed as query parameters so that quotes or other special
    # characters in the scraped text cannot break or alter the statement.
    sql = "insert into region_town_t(town_id,town,area_id) values(%s,%s,%s)"
    for town_item in town_items:
        tid = town_item.get_attribute("attr-id")
        tname = town_item.get_attribute("title")
        if tid == "-1":
            print("continuetown")
            continue
        params = (tid, tname, aid)
        print(sql, params)
        # The context manager closes the cursor once the insert is done.
        with CONNECTION.cursor() as cursor:
            cursor.execute(sql, params)
        CONNECTION.commit()
def back_tab(index):
    """Click one tab of the address selector and wait half a second.

    Args:
        index: Position of the link to click among the ``a`` elements found
            under the ``city-select-tab`` element.
    """
    tab_bar = DRIVER.find_element_by_class_name("city-select-tab")
    tab_links = tab_bar.find_elements_by_tag_name("a")
    # JavaScript click, as elsewhere in this script, instead of a native click.
    DRIVER.execute_script('arguments[0].click();', tab_links[index])
    time.sleep(0.5)
# Run the crawl. The try/finally blocks make sure the browser process and the
# database connection are released even when login or crawling fails.
init_db()
try:
    init_web_driver()
    try:
        login_taobao("用户名", "密码")
        get_data()
    finally:
        close_web_driver()
finally:
    close_db()
到此,这篇关于selenium+headless chrome爬虫的实现示例的文章就介绍到这了。更多相关selenium+headless chrome爬虫的内容,请搜索毛票票以前的文章或继续浏览下面的相关文章,希望大家以后多多支持毛票票!