13. 网络爬虫之实战

发表于 2018-06-27 更新于 2022-11-05 分类于语言， Python

了解正则表达式

正则 1

正则 2

RE 库主要功能函数

经典的正则表达式

# 包含字母的字符串
[A-Za-z]+

# 包含字母或数字的字符串
[A-Za-z0-9]+

# 整数形式的字符串
^-?\d+$

# 正整数形式的字符串
[1-9]\d*

# 邮政编码
[1-9]\d{5}

# 匹配中文字符
[\u4e00-\u9fa5]

# 国内电话号码，如 010-68913536
\d{3}-\d{8}|\d{4}-\d{7}

# 匹配 IP 地址（由 4 段 0~255 的数字组成，段与段之间以点号分隔）
(([1-9]?\d|1\d{2}|2[0-4]\d|25[0-5])\.){3}([1-9]?\d|1\d{2}|2[0-4]\d|25[0-5])

raw string 类型（原生字符串类型）

re库采用raw string类型表示正则表达式，表示为：r'text'。
raw string是不包含对转义符再次转义的字符串.
建议：当正则表达式包含转义符时，使用raw string

re.search(pattern, string, flags=0)

∙ pattern : 正则表达式的字符串或原生字符串表示
∙ string : 待匹配字符串
∙ flags : 正则表达式使用时的控制标记

re.split(pattern, string, maxsplit=0, flags=0)

∙ pattern : 正则表达式的字符串或原生字符串表示
∙ string : 待匹配字符串
∙ maxsplit: 最大分割数，剩余部分作为最后一个元素输出
∙ flags : 正则表达式使用时的控制标记

re.sub(pattern, repl, string, count=0, flags=0)

∙ pattern : 正则表达式的字符串或原生字符串表示
∙ repl : 替换匹配字符串的字符串
∙ string : 待匹配字符串
∙ count : 匹配的最大替换次数
∙ flags : 正则表达式使用时的控制标记

RE 库的另一种等价用法

面向对象用法:

pat = re.compile(r'[0-9]\d{5}')
match = pat.search("祁阳陶铸路邮编:426100")

Match 对象的属性

Match1

Match2

贪婪匹配和最小匹配

贪婪匹配和最小匹配1

贪婪匹配和最小匹配2

最小匹配操作符

只要输出长度可能不同的操作符，都可以通过在操作符后增加 ? 变成最小匹配

股票数据定向爬虫

import re
import requests
import bs4
import traceback
from bs4 import BeautifulSoup

# Column labels; the same strings are used as the keys of each stock record.
key_code = '股票代码'
key_name = '股票名称'
key_jin = '今开'
key_cheng = '成交量'
# Fill character for the name column.
# NOTE(review): presumably a full-width (CJK) space so Chinese names align - confirm.
chinese_blank = '　'
# Row template: four tab-separated, left-aligned columns of widths 12, 7, 8, 8;
# only the second column is padded with chinese_blank instead of a plain space.
tplt = "\t".join(["{:<12}", "{:" + chinese_blank + "<7}", "{:<8}", "{:<8}"])

def getHTMLText(url, encoding="UTF-8"):
    """Download ``url`` and return its body decoded with ``encoding``.

    Args:
        url: Address of the page to fetch.
        encoding: Character encoding forced onto the response before its
            text is read.

    Returns:
        The decoded page text, or the placeholder string "产生HTTPError"
        when the request fails (connection problem, timeout, or a 4xx/5xx
        status reported by ``raise_for_status``).
    """
    # Send a browser-like User-Agent header with the request.
    headers = {'User-Agent': 'Mozilla/5.0 Chrome/63.0.3239.132'}
    try:
        response = requests.get(url, timeout=12, headers=headers)
        response.raise_for_status()
    except requests.RequestException:
        # Only request failures are turned into the placeholder; a bare
        # ``except`` would also swallow KeyboardInterrupt and real bugs.
        return "产生HTTPError"
    # Use the caller-supplied encoding rather than the one requests guessed.
    response.encoding = encoding
    return response.text

def printStock(strockItemDict):
    """Print one stock record as a row formatted with the ``tplt`` template.

    Args:
        strockItemDict: Mapping that contains the ``key_code``, ``key_name``,
            ``key_jin`` and ``key_cheng`` entries; they are printed in that
            order.
    """
    columns = (key_code, key_name, key_jin, key_cheng)
    cells = [strockItemDict[column] for column in columns]
    print(tplt.format(*cells))

def getStockList(stockList, stockURL):
    """Append the stock codes found on the index page to ``stockList``.

    The page at ``stockURL`` is fetched as gb2312 text. Every ``<a>`` element
    with ``target="_blank"`` is inspected, and the first ``sh``/``sz`` prefix
    followed by six digits found in its ``href`` is appended.

    Args:
        stockList: List that receives the codes; it is extended in place.
        stockURL: Address of the page that links to the individual stocks.
    """
    htmlText = getHTMLText(stockURL, "gb2312")
    document = BeautifulSoup(htmlText, "html.parser")
    pattern = re.compile(r's[hz]\d{6}')
    for anchor in document.find_all('a', attrs={"target": "_blank"}):
        href = anchor.get('href')
        if not href:
            # Anchor without a usable link target: nothing to extract.
            continue
        match = pattern.search(href)
        if match is not None:
            stockList.append(match.group(0))

def getStockInfo(stockList, stockURL, filePath):
    """Fetch the quote page of every stock code and print one row per stock.

    A header row is printed first. For each code the page
    ``stockURL + code + ".html"`` is downloaded. The stock name is the first
    whitespace-separated token of the ``bets-name`` element, and the opening
    price and volume are read from the ``<dt>``/``<dd>`` pairs inside the
    ``stock-bets`` div. Both values default to "--" when their label is not
    on the page.

    Args:
        stockList: Iterable of stock codes such as "sh600031".
        stockURL: Base address of the quote pages.
        filePath: Accepted for interface compatibility; not used, rows are
            only printed to standard output.
    """
    # Sample codes for manual testing: sh201004 (market closed),
    # sh203007 (404 page), sh600031 (regular stock).
    print(tplt.format(key_code, key_name, key_jin, key_cheng))
    for stockCode in stockList:
        htmlText = getHTMLText(stockURL + stockCode + ".html")
        if not htmlText:
            continue
        try:
            document = BeautifulSoup(htmlText, "html.parser")
            stockInfo = document.find('div', attrs={'class': 'stock-bets'})
            if stockInfo is None:
                # No quote panel: a 404 error page, a failed download, or a
                # page with a different layout. There is nothing to print.
                continue
            betsNameDiv = stockInfo.find(attrs={'class': 'bets-name'})
            nameParts = betsNameDiv.text.split() if betsNameDiv is not None else []
            if not nameParts:
                continue
            infoDict = {key_code: stockCode, key_name: nameParts[0],
                        key_jin: "--", key_cheng: "--"}
            # Pair each label with its value; zip stops at the shorter list,
            # so a stray <dt> without a <dd> cannot raise IndexError.
            labelTags = stockInfo.find_all('dt')
            valueTags = stockInfo.find_all('dd')
            for labelTag, valueTag in zip(labelTags, valueTags):
                label = labelTag.text
                if label in (key_jin, key_cheng):
                    infoDict[label] = valueTag.text
            printStock(infoDict)
        except Exception:
            # Best effort: report the unexpected failure and move on to the
            # next stock, without swallowing KeyboardInterrupt/SystemExit.
            traceback.print_exc()
            continue

def main():
    """Collect the stock codes from the index page, then print their quotes."""
    listPageURL = 'http://quote.eastmoney.com/stocklist.html'
    quotePageBaseURL = 'https://gupiao.baidu.com/stock/'
    resultPath = 'D://likai.txt'
    codes = []
    getStockList(codes, listPageURL)
    getStockInfo(codes, quotePageBaseURL, resultPath)

# Start the crawl as soon as this module is loaded.
main()

最终抓取效果