一 requests請求庫爬取豆瓣電影信息
?
- 請求方式
    GET

- 請求頭
    user-agent
    cookies
# Scrape the Douban Top 250 movie list: download each index page, extract the
# per-movie fields with a regular expression, and append them to a text file.
import re

# Browser-like User-Agent sent when the caller supplies no headers.
# NOTE(review): the notes for this script list user-agent as a required request
# header; the default python-requests agent is presumably rejected by the site
# -- confirm against the live service.
DEFAULT_HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    ),
}

# Marker that opens every movie entry on an index page.
_ITEM_MARKER = '<div class="item">'

# Applied to ONE movie entry at a time. The director/votes labels are accepted
# in both simplified and traditional script; the cast and the one-line quote
# are optional so an entry without them still yields a record.
_MOVIE_PATTERN = re.compile(
    r'<em class="">(.*?)</em>'
    r'.*?<a href="(.*?)">'
    r'.*?<span class="title">(.*?)</span>'
    r'.*?(?:导演|導演): (.*?)(?:主演: (.*?))?<br>(.*?)</p>'
    r'.*?<span class="rating_num".*?>(.*?)</span>'
    r'.*?<span>(.*?)(?:人评价|人評價)</span>'
    r'(?:.*?<span class="inq">(.*?)</span>)?',
    re.S,
)


def get_page(url, headers=None, timeout=10):
    """Send a GET request to *url* and return the response object.

    Args:
        url: Address of the page to download.
        headers: Optional request headers; ``DEFAULT_HEADERS`` is used when
            omitted.
        timeout: Seconds to wait before giving up, so a stalled connection
            cannot hang the script forever.

    Returns:
        The ``requests`` response, whatever its status code.
    """
    # Imported here so the parsing and saving helpers stay importable when the
    # third-party dependency is not installed.
    import requests

    response = requests.get(
        url, headers=headers or DEFAULT_HEADERS, timeout=timeout
    )
    return response


def parse_index(html):
    """Extract the movie records from one index page.

    The page is split on the entry marker and each entry is matched on its
    own, so a field missing from one entry cannot pull in text from the next.

    Args:
        html: Text of an index page.

    Returns:
        A list of 9-tuples of strings: rank, detail url, title, director,
        cast, year/type line, rating, vote count and quote. Optional fields
        that are absent are returned as ``''``. Entries that do not match the
        pattern are skipped.
    """
    movie_list = []
    # The first piece is everything before the first entry; drop it.
    for entry in html.split(_ITEM_MARKER)[1:]:
        match = _MOVIE_PATTERN.search(entry)
        if match is None:
            continue
        movie_list.append(tuple(field or '' for field in match.groups()))
    return movie_list


def _format_movie(movie):
    """Render one nine-field movie tuple as the text record saved to disk."""
    top, m_url, name, daoyan, actor, year_type, point, commit, desc = movie
    # Strip all surrounding whitespace, not only newlines: the captured
    # year/type text sits on its own indented line in the page.
    year_type = year_type.strip()
    return (
        '===============================\n'
        f'電影排名:{top}\n'
        f'電影url:{m_url}\n'
        f'電影名稱:{name}\n'
        f'電影導演:{daoyan}\n'
        f'電影主演:{actor}\n'
        f'電影類型:{year_type}\n'
        f'電影評分:{point}\n'
        f'電影評論:{commit}\n'
        f'電影簡介:{desc}\n'
        '===============================\n'
        '\n'
    )


def save_data(movie, path='douban_top250.txt'):
    """Print one movie record and append it to a UTF-8 text file.

    Args:
        movie: A 9-tuple as produced by ``parse_index``.
        path: File to append to; defaults to ``douban_top250.txt`` in the
            current directory.
    """
    data = _format_movie(movie)
    print(data)
    with open(path, 'a', encoding='utf-8') as f:
        f.write(data)
    name = movie[2]
    print(f'電影:{name}寫入成功...')


def main():
    """Walk the ten index pages (25 movies each) and save every movie found."""
    for start in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={start}&filter='
        print(url)
        index_res = get_page(url)
        if index_res.status_code != 200:
            # An error page contains no entries; say so instead of silently
            # writing nothing.
            print(f'skipped {url}: HTTP {index_res.status_code}')
            continue
        for movie in parse_index(index_res.text):
            save_data(movie)


if __name__ == '__main__':
    main()
?
2、為什麼要使用selenium?
優點:
- 執行js代碼
- 不需要分析複雜的通信流程
- 對瀏覽器做彈窗、下拉等操作
- ***** 獲取動態數據
- *** 破解登錄驗證
缺點:
- 執行效率低
3、安裝與使用
1. 安裝selenium請求庫:
pip3 install selenium
2. 必須安裝瀏覽器
"谷歌"或者火狐
3.安裝瀏覽器驅動
http://npm.taobao.org/mirrors/chromedriver/2.38/
windows:
下載win32驅動
?
selenium請求庫爬蟲京東網進程
# Open JD.com in Chrome, wait for the search box, type a keyword and submit it.
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# NOTE(review): passing the driver path positionally is the Selenium 3 calling
# convention; newer Selenium releases expect a Service object -- confirm the
# installed version before upgrading.
driver = webdriver.Chrome(r'C:\Users\HP\Desktop\chromedriver.exe')

try:
    driver.get('https://www.jd.com/')

    # Explicit wait: poll for up to 10 seconds until the search box exists.
    wait = WebDriverWait(driver, 10)
    input_tag = wait.until(EC.presence_of_element_located((By.ID, 'key')))
    time.sleep(5)

    # Type the keyword, then press Enter to submit the search.
    input_tag.send_keys('公仔')
    input_tag.send_keys(Keys.ENTER)

    # Keep the browser open long enough to look at the result page.
    time.sleep(20)
finally:
    # quit() ends the whole WebDriver session (every window plus the driver
    # process); close() would only close the current window.
    driver.quit()
?
selenium請求庫爬蟲登錄百度
# Log in to Baidu through Chrome, using a different element locator strategy
# at each step.
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# NOTE(review): passing the driver path positionally is the Selenium 3 calling
# convention; newer Selenium releases expect a Service object -- confirm the
# installed version before upgrading.
driver = webdriver.Chrome(r'C:\Users\HP\Desktop\chromedriver.exe')

try:
    # Implicit wait: every element lookup retries for up to 10 seconds.
    driver.implicitly_wait(10)
    driver.get('https://www.baidu.com/')
    time.sleep(5)

    # 1. Locate by exact link text.
    # NOTE(review): confirm this text matches the script (simplified vs.
    # traditional characters) actually rendered by the live page.
    login_link = driver.find_element(By.LINK_TEXT, '登錄')
    login_link.click()
    time.sleep(1)

    # 2. Locate by id: switch the dialog to username/password login.
    user_login = driver.find_element(By.ID, 'TANGRAM__PSP_10__footerULoginBtn')
    user_login.click()
    time.sleep(1)

    # 3. Locate by class name: the username field.
    # NOTE(review): the account and password below are hard-coded in the
    # script; consider reading them from the environment instead.
    user = driver.find_element(By.CLASS_NAME, 'pass-text-input-userName')
    user.send_keys('1847702753@qq.com')

    # 4. Locate by name attribute: the password field.
    pwd = driver.find_element(By.NAME, 'password')
    pwd.send_keys('*****')

    submit = driver.find_element(By.ID, 'TANGRAM__PSP_10__submit')
    submit.click()

    # 5. Locate by partial link text.
    login_link = driver.find_element(By.PARTIAL_LINK_TEXT, '登')
    login_link.click()

    # 6. Locate by CSS selector.
    login2_link = driver.find_element(By.CSS_SELECTOR, '.tang-pass-footerBarULogin')
    login2_link.click()

    # 7. Locate by tag name: find_elements returns every match as a list.
    div = driver.find_elements(By.TAG_NAME, 'div')
    print(div)

    # Keep the browser open long enough to look at the result.
    time.sleep(20)
finally:
    # quit() ends the whole WebDriver session (every window plus the driver
    # process); close() would only close the current window.
    driver.quit()
?