Scraper

[TOC]

urllib

Python's built-in library for making HTTP requests

Import

from urllib.request import urlopen
from urllib.request import urlretrieve
from urllib.error import HTTPError

Open URL

page = urlopen(URL)
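
The other two imports above are typically used like this; a minimal sketch with placeholder URLs and filename:

try:
    page = urlopen("https://example.com")               # returns a file-like response object
    html = page.read()                                  # raw bytes of the response body
    urlretrieve("https://example.com/a.pdf", "a.pdf")   # download straight to a local file
except HTTPError as e:
    print(e.code, e.reason)                             # e.g. 404 Not Found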

Requests

HTTP for Humans

Import

import requests

get/post

r = requests.get(URL)
r = requests.post(URL)

Add Headers

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Host': 'www.zhihu.com',
    'Referer': 'https://www.zhihu.com/'}
r = requests.get(URL, headers=headers)

Add cookies

cookies = dict(cookies_are='working')
r = requests.get(URL, cookies=cookies)
# using cookie jar
jar = requests.cookies.RequestsCookieJar()
jar.set('tasty_cookie', 'yum', domain='httpbin.org', path='/cookies')
jar.set('gross_cookie', 'blech', domain='httpbin.org', path='/elsewhere')
r = requests.get(URL, cookies=jar)
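
Cookies the server sets come back on the response object; a small sketch:

r = requests.get(URL)
print(r.cookies.get_dict())   # cookies set by the server in this response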

Check results

r.status_code
r.headers
r.text

Build a session

s = requests.Session()
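
A Session keeps cookies and default headers across requests; a minimal sketch (httpbin.org is only a demo endpoint):

s = requests.Session()
s.headers.update({'User-Agent': 'my-scraper/0.1'})   # sent with every request
s.get('https://httpbin.org/cookies/set/token/abc')   # cookie stored in the session
r = s.get('https://httpbin.org/cookies')             # ...and sent back automatically
print(r.json())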

Post with string data

data = "some string"
r = requests.post(URL, data=data)

Post with json data

import json
data = json.dumps({"key1": item1, "key2": item2})
r = requests.post(URL, data=data)
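
requests can also serialize the payload itself; passing json= sets the Content-Type: application/json header automatically:

r = requests.post(URL, json={"key1": "value1", "key2": "value2"})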

Post with multi-part files

files = {"files": ("test.pdf", open("test.pdf", "rb"), "application/pdf"),
         "info": (None, "information")}
r = requests.post(URL, files=files)

Download file

report = requests.get(URL, stream=True)
with open("report.pdf", "wb") as f:
    for chunk in report.iter_content(chunk_size=128):
        f.write(chunk)

Use proxy

proxies = {'http': 'http://proxy:port',
           'https': 'https://proxy:port'}
r = requests.get(URL, proxies=proxies)

BeautifulSoup 4

page parser

Import

from bs4 import BeautifulSoup

Parse page

bsObj = BeautifulSoup(page.read(), 'html.parser')

Find elements

targets = bsObj.find_all("span", {"class": "a_cover"})
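
The result is a list of Tag objects; a short sketch of pulling text and attributes out of them (the tag and attribute names are just examples):

for tag in bsObj.find_all("span", {"class": "a_cover"}):
    print(tag.get_text(strip=True))   # visible text inside the tag
    print(tag.get("id"))              # attribute value, or None if missing
first = bsObj.find("a")               # find() returns only the first match (or None)
if first is not None:
    print(first["href"])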

lxml

page parser with XPath

Import

from lxml import etree

Parse page

page = etree.HTML(response.text)

Find elements using xpath

# select all elements below the current node ("//") with the given tag ("span")
# whose attribute ("[@class]") has the given value ("className")
elems = page.xpath('//span[@class="className"]')
# select all elements that have a child element named "tag"
elems = page.xpath('//*[tag]')
for elem in elems:
    # get the tag name, text, and an attribute of an element
    print(elem.tag, elem.text, elem.get('attrib'))
# get the text directly
elems = page.xpath('//span[@class="className"]/text()')
# filter by position
elems = page.xpath('//span[position()=1]')
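
A self-contained sketch of the calls above against an inline HTML snippet:

from lxml import etree

html = '<div><span class="className" id="a">one</span><span class="className" id="b">two</span></div>'
page = etree.HTML(html)
for elem in page.xpath('//span[@class="className"]'):
    print(elem.tag, elem.text, elem.get('id'))           # span one a / span two b
print(page.xpath('//span[@class="className"]/text()'))   # ['one', 'two']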

Selenium

browser automation (handles JavaScript-rendered pages)

Install

pip install selenium

# on Linux, the chromium driver also needs to be installed
sudo apt-get install chromium-chromedriver

Download the Chrome driver

https://chromedriver.chromium.org/getting-started

Usage

Import

from selenium import webdriver

Initialize

# PhantomJS was deprecated in Selenium 4.0; use Chrome/Firefox in headless mode instead
driver = webdriver.PhantomJS(executable_path=path)

# headless chrome
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument('--headless')   # options.headless = True is deprecated in newer Selenium releases
driver = webdriver.Chrome(options=options)

Get page

driver.get(URL)

Set headers

webdriver.DesiredCapabilities.PHANTOMJS['phantomjs.page.settings.userAgent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'
webdriver.DesiredCapabilities.PHANTOMJS['phantomjs.page.customHeaders.Accept-Language'] = 'en-US,en;q=0.6'
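
The capabilities above only apply to PhantomJS; with headless Chrome the usual way to set a custom User-Agent is through the options (a sketch, the UA string is just an example):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')
options.add_argument('user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0 Safari/537.36')
driver = webdriver.Chrome(options=options)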

Handle pop-up alerts

al = driver.switch_to.alert
al.accept()

Locate elements

from selenium.webdriver.common.by import By

driver.find_elements(by=By.CLASS_NAME, value="cell-type")
driver.find_element(By.XPATH, '//button[text()="Some text"]')
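
Elements rendered by JavaScript often appear with a delay, so an explicit wait is common (a minimal sketch, the locator is only an example):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 10)    # give up after 10 seconds
elem = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "cell-type")))
print(elem.text)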

gevent

coroutine-based crawling

Import

import gevent
import gevent.queue
import gevent.pool
import gevent.monkey

Monkey-patch Python's blocking built-in functions so they cooperate with gevent

gevent.monkey.patch_all()

Build a pool with a fixed number of greenlets

gevent_pool = gevent.pool.Pool(20)

Spawn greenlets and wait for them to finish

threads = []
threads.append(gevent_pool.spawn(function, *args))  # or gevent.spawn(function, *args) without a pool
gevent.joinall(threads)
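
Putting the pieces together, a minimal sketch that fetches a few URLs concurrently (the URL list is just an example):

import gevent.monkey
gevent.monkey.patch_all()            # patch first, before other network libraries load

import gevent.pool
import requests

def fetch(url):
    r = requests.get(url)
    return url, r.status_code

pool = gevent.pool.Pool(20)
jobs = [pool.spawn(fetch, u) for u in ['https://example.com', 'https://example.org']]
gevent.joinall(jobs)
print([job.value for job in jobs])   # (url, status) returned by each greenlet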

Other

Wait a random amount of time

import random
import time
time.sleep(random.random())

URL-encode a string

from urllib.parse import quote
s = quote(s)
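
quote escapes a single URL component; urlencode builds a whole query string from a dict:

from urllib.parse import quote, urlencode

print(quote("a b/c"))                                 # 'a%20b/c'  ('/' is kept by default)
print(quote("a b/c", safe=""))                        # 'a%20b%2Fc'
print(urlencode({"q": "web scraping", "page": 2}))    # 'q=web+scraping&page=2'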