明天的筆記本‧Tomorrow notebook

Python 之 PyQuery

初始化

from lxml import etree
from pyquery import PyQuery as pq

# Initialise from an HTML string.
html = 'html string...'
j = pq(html)

# Initialise from a local file.
j = pq(filename='XXX.html')

# Initialise from a URL, decoding the response as UTF-8.
url = 'https://tw.news.yahoo.com/'
j = pq(url=url, encoding='utf-8')
print(j('title'))

# Send custom request headers with the fetch.
j = pq('http://xxx.com/', headers={'user-agent': 'pyquery'})

# Issue the request as a POST, passing the dict as the request data.
j = pq('http://xxx.com/post', {'foo': 'bar'}, method='post', verify=True)

# Initialise from an element that lxml has already parsed.
# (The original line had an unbalanced extra ")" and did not compile.)
# NOTE(review): etree.fromstring is lxml's XML parser entry point; if the goal
# is to repair markup with unclosed tags, lxml's HTML parser (etree.HTML /
# lxml.html.fromstring) is the lenient route - confirm which was intended.
j = pq(etree.fromstring("<html></html>"))

pq() 的參數可以直接傳入 HTML 字串（也可以是檔名、URL 或 lxml 元素）；初始化之後，j 就相當於 jQuery 裡的 $ 符號，可以用 CSS 選擇器來查詢，例如 j('title')。

實例：爬取奇摩新聞首頁的所有超連結及其連結文字

import requests  # HTTP client library
from fake_useragent import UserAgent  # supplies random User-Agent strings
from pyquery import PyQuery as pq

# Page to scrape.
url = 'https://tw.news.yahoo.com/'

# Build a request header carrying a randomly chosen User-Agent.
ua = UserAgent()
user_agent = {'User-Agent': str(ua.random)}

# Fetch the page. The timeout stops the script from blocking forever when the
# server never answers (requests applies no timeout unless one is given).
r = requests.get(url, headers=user_agent, timeout=10)
# Decode the body as UTF-8; assign another codec here (e.g. 'gb18030') for
# pages served in a different encoding.
r.encoding = 'utf8'

# print(r.text)  # uncomment to inspect the raw HTML

# Build the PyQuery document from the downloaded text, forcing the HTML parser.
j = pq(r.text, parser='html')
print(j('title'))

# attr() on a multi-element selection reads from the first match only.
_a = j('a').attr('href')
print(_a)

# items() yields each matched <a> as its own PyQuery object, so every link can
# be inspected individually.
info = j('a').items()

# Walk every <a> element and print its text and target.
for link in info:
    href = link.attr('href')  # link target; None when the tag has no href
    text = link.text()  # visible link text

    # Anchors without an href are not hyperlinks; concatenating None to a
    # str would raise TypeError, so skip them.
    if href is None:
        continue

    print('標題:' + text)
    print('超連結:' + href)

其他綜合問題：


# Work around garbled Chinese text: tell pyquery which encoding the page uses.
j = pq(url=url, encoding='gb2312')

# Force the HTML parser (left unset, pyquery tries its XML parser first).
j = pq(url=url, parser='html')