Scraper
[TOC]
urllib
Python's built-in library for making HTTP requests
Import
from urllib.request import urlopen
from urllib.request import urlretrieve
from urllib.error import HTTPError
Open URL
page = urlopen(URL)
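The other imports above cover error handling and direct file downloads; a minimal sketch (the URLs and the local filename are placeholders):
try:
    page = urlopen("https://example.com")                         # file-like response object
    html = page.read().decode("utf-8")                            # raw bytes -> str
    urlretrieve("https://example.com/report.pdf", "report.pdf")   # download straight to disk
except HTTPError as e:
    print(e.code, e.reason)                                       # e.g. 404 Not Found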
Requests
HTTP for Humans
Import
import requests
get/post
r = requests.get(URL)
r = requests.post(URL)
Add Headers
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
'Accept': 'text/html, application/xhtml+xml, application/xml; q=0.9, image/webp, */*; q=0.8',
'Host': 'www.zhihu.com',
'Referer': 'https://www.zhihu.com/'}
r = requests.get(URL, headers=headers)
Add cookies
cookies = dict(cookies_are='working')
r = requests.get(URL, cookies=cookies)
# using cookie jar
jar = requests.cookies.RequestsCookieJar()
jar.set('tasty_cookie', 'yum', domain='httpbin.org', path='/cookies')
jar.set('gross_cookie', 'blech', domain='httpbin.org', path='/elsewhere')
r = requests.get(URL, cookies=jar)
Check results
r.status_code
r.headers
r.text
Build a session
s = requests.Session()
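Cookies and default headers set on the session persist across its requests; a small sketch (the httpbin URLs are only for illustration):
s.headers.update({'User-Agent': 'my-scraper/0.1'})               # sent with every request
s.get('https://httpbin.org/cookies/set/sessioncookie/123456789')
r = s.get('https://httpbin.org/cookies')                         # cookie set above is reused
print(r.json())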
Post with string data
data = "some string"
r = requests.post(URL, data=data)
Post with json data
import json
data = json.dumps({"key1": "value1", "key2": "value2"})
r = requests.post(URL, data=data)
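requests can also serialize the dict itself via the json= parameter, which sets the Content-Type header to application/json:
r = requests.post(URL, json={"key1": "value1", "key2": "value2"})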
Post with multi-part files
files = {"files": ("test.pdf", open("test.pdf", "rb"), "application/pdf"),
         "info": (None, "information")}
r = requests.post(URL, files=files)
Download file
report = s.get(URL, stream=True)
with open("report.pdf", "wb") as f:
    for chunk in report.iter_content(chunk_size=128):
        f.write(chunk)
Use proxy
proxies = {'http':'http://proxy:port',
'https':'https://proxy:port'
}
r = requests.get(URL, proxies=proxies)
BeautifulSoup 4
page parser
Import
from bs4 import BeautifulSoup
Parse page
bsObj = BeautifulSoup(page.read(), 'html.parser')
Find element
targets = bsObj.find_all("span", {"class": "a_cover"})
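Text and attributes can then be read off each result; a minimal sketch (tag, class, and attribute names are placeholders):
for target in targets:
    print(target.get_text(strip=True))                # visible text inside the tag
    print(target.get("id"))                           # attribute value, or None if absent
first = bsObj.find("a", {"class": "a_cover"})         # find() returns only the first match
if first is not None:
    print(first["href"])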
lxml
page parser with XPath support
Import
from lxml import etree
Parse page
page = etree.HTML(response.text)
Find elements using xpath
# select all descendant elements ("//") with the given tag ("span") whose attribute ("@class") equals the given value ("className")
elems = page.xpath('//span[@class="className"]')
# select all elements that have a child element named "tag"
elems = page.xpath('//*[tag]')
for elem in elems:
    # get the tag, text, and an attribute of an element
    print(elem.tag, elem.text, elem.get('attrib'))
# get the text directly
elems = page.xpath('//span[@class="className"]/text()')
# select by position (the first span)
elems = page.xpath('//span[position()=1]')
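Putting it together with requests; a minimal sketch (URL and class name are placeholders):
import requests
from lxml import etree

response = requests.get("https://example.com")
page = etree.HTML(response.text)
for elem in page.xpath('//a[@class="className"]'):
    print(elem.text, elem.get("href"))                # link text and href attribute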
Selenium
browser automation (drives a real or headless browser)
Install
pip install selenium
# on Linux, install the chromium driver as well
sudo apt-get install chromium-chromedriver
Download the Chrome driver
https://chromedriver.chromium.org/getting-started
Usage
Import
from selenium import webdriver
Initialize
# PhantomJS was deprecated in Selenium 4.0; use Chrome/Firefox in headless mode instead
driver = webdriver.PhantomJS(executable_path=path)
# headless chrome
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument("--headless")  # newer Selenium versions drop the options.headless attribute
driver = webdriver.Chrome(options=options)
Get page
driver.get(URL)
Set headers
webdriver.DesiredCapabilities.PHANTOMJS['phantomjs.page.settings.userAgent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'
webdriver.DesiredCapabilities.PHANTOMJS['phantomjs.page.customHeaders.Accept-Language'] = 'en-US,en;q=0.6'
Handle alert popups
al = driver.switch_to.alert
al.accept()
Locate elements
from selenium.webdriver.common.by import By
driver.find_elements(by=By.CLASS_NAME, value="cell-type")
driver.find_element(By.XPATH, '//button[text()="Some text"]')
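A minimal end-to-end headless run, assuming chromedriver is on the PATH (URL and locators are placeholders):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)
try:
    driver.get("https://example.com")
    for cell in driver.find_elements(By.CLASS_NAME, "cell-type"):
        print(cell.text)
finally:
    driver.quit()                                     # always release the browser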
gevent
Coroutine-based crawling
import
import gevent
import gevent.queue
import gevent.pool
import gevent.monkey
Monkey-patch blocking standard-library functions so they cooperate with gevent
gevent.monkey.patch_all()
Build a pool with a fixed number of greenlets
gevent_pool = gevent.pool.Pool(20)
Spawn greenlets and wait for them all to finish
threads = []
threads.append(gevent.spawn(function, *args))
gevent.joinall(threads)
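Combining the pieces into a small concurrent fetcher; a minimal sketch (the URLs are placeholders, and patch_all() should run as early as possible, before network modules are imported):
import gevent.monkey
gevent.monkey.patch_all()                             # patch sockets first

import gevent
import gevent.pool
import requests

def fetch(url):
    r = requests.get(url, timeout=10)
    print(url, r.status_code)

urls = ["https://example.com/page%d" % i for i in range(1, 6)]
pool = gevent.pool.Pool(20)                           # at most 20 concurrent greenlets
jobs = [pool.spawn(fetch, u) for u in urls]
gevent.joinall(jobs)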
Other
Wait random time
import random
import time
time.sleep(random.random())
URL-encode a string
from urllib.parse import quote
s = quote(s)
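quote percent-encodes characters that are not safe in a URL, e.g.:
s = quote("北京 weather")        # -> '%E5%8C%97%E4%BA%AC%20weather'
url = "https://example.com/search?q=" + s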