-
大小: 11KB  文件类型: .py  金币: 1  下载: 0 次  发布日期: 2023-12-20
- 语言: Python
- 标签:
资源简介
https://github.com/helloMickey/project_previous/tree/master/judicial-data-analysis
爬取法律判决书的日期、年份、处理法院,并下载相应文书。
代码简单修改参数即可爬取不同的案件
代码片段和文件信息
# coding:utf-8
# Module setup for the pkulaw.cn judgment crawler (Python 2 code: it relies on
# urllib2, the reload() builtin and sys.setdefaultencoding).
import os
import random
import re
import socket
import sys
import threading
import time
import urllib2

import requests
from lxml import etree
# import cchardet

# Give every socket created without an explicit timeout a 60 second limit.
socket.setdefaulttimeout(60)

# Python 2 hack: reload(sys) restores sys.setdefaultencoding (removed from the
# module at interpreter start-up) so implicit str <-> unicode conversions use
# UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf-8')

# Disabled: build the gid log file name by counting the existing
# 'gid_log*' files in the current directory.
# filenames = os.listdir('.')
# count = 0
# for fname in filenames:
#     if fname.startswith('gid_log'):
#         count += 1
# gid_path = 'gid_log_%d' % (count)
# When steps 1 and 2 are run separately, pay attention to gid_path.
# gid_path = 'gid_log_12'
def get_html(url):  # fetch the page source
    """Download ``url`` with browser-like headers and return the response text.

    Args:
        url: Address of the page to fetch (the ``Host`` header is fixed to
            ``www.pkulaw.cn``).

    Returns:
        The response body decoded by ``requests`` (``Response.text``).

    Raises:
        requests.exceptions.RequestException: on connection failures or when
            the server does not answer within the timeout.
    """
    # The header values below mimic a desktop Chrome session. The commas
    # inside the values (language list, encoding list, Accept list, the
    # "KHTML, like Gecko" token and the Hm_lvt timestamp lists) are required
    # by the respective header formats.
    # NOTE(review): the Cookie is a hard-coded session captured in 2016 and is
    # presumably expired - confirm / refresh before running.
    headers = {
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
        "Host": "www.pkulaw.cn",
        "Cookie": "bdyh_record=1970324860086081%2C1970324860087844%2C1970324860087837%2C1970324860087907%2C1970324860085114%2C1970324860087657%2C1970324860087697%2C1970324860087631%2C1970324860087701%2C1970324860087851%2C1970324860086614%2C1970324860000764%2C1970324845231811%2C1970324860004991%2C1970324860002384%2C1970324845231794%2C1970324845231624%2C1970324860002207%2C1970324860046814%2C1970324860046704%2C; CheckIPAuto=0; CheckIPDate=2016-10-15 10:03:46; gm3jc5afyl35gm2yt55kc4m1isIPlogin=1; ASP.NET_SessionId=davttbjhikxhqyn1lj5alhsb; Hm_lvt_58c470ff9657d300e66c7f33590e53a8=1476497011,1476498348,1476498528,1476499578; Hm_lpvt_58c470ff9657d300e66c7f33590e53a8=1476499578; Hm_lvt_8266968662c086f34b2a3e2ae9014bf8=1476497011,1476498348,1476498528,1476499578; Hm_lpvt_8266968662c086f34b2a3e2ae9014bf8=1476499578; CookieId=gm3jc5afyl35gm2yt55kc4m1; FWinCookie=1",
        "Upgrade-Insecure-Requests": "1",
        "Proxy-Connection": "keep-alive",
    }
    # requests never times out unless told to, so pass an explicit limit
    # (60 s) rather than risk a stalled server hanging the crawl forever.
    html = requests.get(url, headers=headers, timeout=60).text
    return html
def write2file(content, filename):  # save a crawled document to a file
    """Write ``content`` to ``filename`` as UTF-8 text.

    If ``filename`` cannot be opened (for example because the title-derived
    name is too long or contains characters the file system rejects), the
    text is written to a fallback file instead: everything before the first
    u'、' in ``filename`` followed by ``'_error_filename.txt'``.

    Args:
        content: Text to store. Byte strings are assumed to be UTF-8.
        filename: Destination path.

    Raises:
        IOError/OSError: if the fallback file cannot be opened either, or if
            writing fails.
    """
    # Local import keeps this function self-contained; io.open behaves the
    # same on Python 2 and 3 and takes care of the UTF-8 encoding.
    import io

    if isinstance(content, bytes):
        content = content.decode('utf-8')
    try:
        f = io.open(filename, 'w', encoding='utf-8')
    except (IOError, OSError, ValueError):
        # ValueError also covers UnicodeError raised for unencodable names.
        filename = filename.split(u'、')[0] + '_error_filename.txt'
        f = io.open(filename, 'w', encoding='utf-8')
    # The context manager closes the handle even when write() raises.
    with f:
        f.write(content)
# Download the judgment document that corresponds to one gid
def load_one_wenshu(gid, title):
    """Fetch the full text of one document and save it as ``<title>.txt``.

    The gid is substituted into the pkulaw.cn full-text URL, the page is
    downloaded with ``get_html``, and the text content of its ``body``
    element is written with ``write2file``.

    Args:
        gid: Document identifier inserted into the URL's ``gid`` parameter.
        title: Document title, used as the output file name (without the
            ``.txt`` extension).

    Raises:
        IndexError: if the parsed page has no ``body`` element.

    NOTE(review): relies on a module-level ``filepath`` (output directory)
    that is defined outside the part of the file visible here - confirm.
    """
    ex_href = 'http://www.pkulaw.cn/case/FullText/_getFulltext?library=pfnl&gid=#gid#&loginSucc=0'
    href = ex_href.replace('#gid#', gid)
    html = get_html(href)
    page = etree.HTML(html)
    # string(.) concatenates all text nodes under <body>, dropping the markup.
    content = page.xpath('body')[0].xpath('string(.)').strip()
    write2file(content, filepath + os.sep + title + '.txt')
def load_one_page_wenshu(gid_list titles): # 多线程抓取多个href的文书
# threads=[] # 尝试多线程加速 失败 访问频繁 出现验证码 封ip
# for i in range(len(gid_list)):
# gidtitle=gid_list[i]titles[i]
# threads.append(threading.Thread(target=load_one_wenshuargs=(gidtitle)))
# for t in threads:
# t.start()
# t.join() # 阻塞
for i in range(len(gid_list)): # 顺序爬取 时间过长 一个月大概需要20~30h
load_one_wenshu(
评论
共有 条评论