Python爬虫获取企查查公开的企业信息
1、参考博文
2、python代码
#!/usr/bin/python3
#-*- coding: utf-8 -*-
import re
import urllib.parse
import urllib.request

import pymysql
#记录公司信息的字典,类似C语⾔的结构体
#字典中的字段包括:company,domain,legal_person,address,email,phone
# Module-level dict for one company's details; starts out empty.
gCompanyInfo = {}
#伪装爬⾍成浏览器
def spider2browser():
    """Make later ``urllib.request.urlopen`` calls identify as a desktop browser.

    Builds an opener whose only default header is a browser ``User-Agent``
    and installs it as the process-wide default opener.

    Returns:
        None.
    """
    headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Without install_opener() the opener is dropped when this function
    # returns and urlopen() keeps sending the default Python User-Agent.
    urllib.request.install_opener(opener)
# end of spider2browser
#处理企业信息的异常字符,如换⾏符、空格、查不到等等
def getTargetInfo(result):
    """Return the first scraped match as a cleaned string.

    Args:
        result: sequence of regex matches; only the first element is used.

    Returns:
        The '暂⽆' placeholder when ``result`` is empty; otherwise
        ``str(result[0])`` with every newline, ``<br/>`` tag and space
        removed, in that order.
    """
    if len(result) == 0:
        return '暂⽆'
    cleaned = str(result[0])
    # Strip, in order: line breaks, leftover <br/> tags, then spaces.
    for junk in ("\n", "<br/>", " "):
        cleaned = cleaned.replace(junk, "")
    return cleaned
# end of getTargetInfo()
#根据公司名字获取企业详细信息
def _fetchPage(url):
    """Download *url* and return its body decoded as UTF-8, dropping bad bytes."""
    # The context manager closes the HTTP response even if read() raises.
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf-8", "ignore")


def getCompanyInfo(chinaCompany):
    """Look up a company by name on the site and scrape its public details.

    Two pages are fetched: the search-result page (legal representative and
    the link to the company page) and the company page itself (official
    website and address).

    Args:
        chinaCompany: company name, may contain Chinese characters.

    Returns:
        dict with keys 'company', 'legal_person', 'domain' and 'address'.
        Fields that cannot be found keep the '暂⽆' placeholder. If the search
        page has no company link, the dict is returned after the first page
        only.

    Raises:
        Whatever ``urllib.request.urlopen`` raises on network/HTTP failure;
        nothing is caught here.
    """
    # NOTE(review): the source had "www.qichacha" with no scheme and no TLD,
    # which urlopen() rejects; restored to the site's public domain - confirm.
    baseUrl = "https://www.qichacha.com"

    companyInfo = {'company':chinaCompany, 'legal_person':'暂⽆', 'domain':'暂⽆', 'address':'暂⽆'}

    # Percent-encode only the company name; quoting the whole URL would also
    # escape ':' , '?' and '='.
    company = urllib.parse.quote(chinaCompany)
    firstUrl = baseUrl + "/search?key=" + company

    # First page: search results. Extract the legal representative's name.
    searchRet = _fetchPage(firstUrl)
    # The label is spelled with the regular ideograph U+4EBA here; the source
    # text carried the look-alike Kangxi radical U+2F08, which would never
    # match the text of a real page.
    matchPat = '法定代表人.*?>(.*?)</a>'
    bosses = re.compile(matchPat, re.S).findall(searchRet)
    companyInfo['legal_person'] = getTargetInfo(bosses)

    # Still on the first page: find the link that leads to the company page.
    matchPat = 'addSearchIndex.*?href="(.*?)" target="_blank" class="ma_h1"'
    nextUrls = re.compile(matchPat, re.S).findall(searchRet)
    if not nextUrls:
        return companyInfo
    nextUrl = baseUrl + str(nextUrls[0])

    # Second page: company details. Extract the official website.
    searchRet = _fetchPage(nextUrl)
    matchPat = 'data-delay="500" rel="nofollow">(.*?)</a> <a onclick'
    urls = re.compile(matchPat, re.S).findall(searchRet)
    companyInfo['domain'] = getTargetInfo(urls)

    # Extract the company address.
    matchPat = 'title="查看地址".*?>(.*?)</a>'
    addresses = re.compile(matchPat, re.S).findall(searchRet)
    companyInfo['address'] = getTargetInfo(addresses)
    return companyInfo
# end of getCompanyInfo()
#将公司的详细信息写⼊数据库
def writeInfoToDB(cursor, companyInfo):
    """Insert one company record into ``company_info`` and commit it.

    Args:
        cursor: database cursor providing ``execute(sql, args)`` and a
            ``connection`` attribute with ``commit()`` (as PyMySQL cursors do).
        companyInfo: dict with keys 'company', 'domain', 'legal_person'
            and 'address'.

    Returns:
        None. A failure during execute/commit is printed and swallowed, so
        the caller is not interrupted.

    Raises:
        KeyError: if ``companyInfo`` lacks one of the four keys (the values
            are read before the ``try`` block).
    """
    sql = "insert into company_info(company,domain,legal_person,address) values(%s, %s, %s, %s)"
    val = (companyInfo['company'], companyInfo['domain'], companyInfo['legal_person'], companyInfo['address'])
    try:
        # Parameterized execute: the driver escapes the scraped values.
        cursor.execute(sql, val)
        # Commit through the cursor's own connection instead of relying on a
        # module-level ``db`` global.
        cursor.connection.commit()
        print("Info: 记录公司 %s 成功" % companyInfo['company'])
    except Exception as err:
        print("Error: 记录公司 %s 失败" % companyInfo['company'])
        print(err)
# end of writeInfoToDB()
#=========主流程从这⾥开始=========#
#从数据库中将所有的企业名单读取出来
# NOTE(review): the source called an undefined name ``t(...)`` here; given the
# ``pymysql`` import and the ``db.cursor()`` call below this is presumably
# ``pymysql.connect`` - confirm. Keyword arguments are used because recent
# PyMySQL releases no longer accept host/user/password/database positionally.
# The four values are the placeholders from the source; replace them.
db = pymysql.connect(host="x", user="数据库⽤户名", password="密码", database="数据库")
try:
    cursor = db.cursor()
    # NOTE(review): the SELECT that loads the company names is missing from
    # the source, so fetchall() has no result set to read. Restore the
    # original ``cursor.execute("select ...")`` here; its table and column
    # cannot be determined from this file.
    companyList = cursor.fetchall()

    # Make urlopen() identify as a browser before any page is requested.
    spider2browser()

    # Scrape every company and store the result; the first column of each
    # row is used as the company name.
    for row in companyList:
        companyInfo = getCompanyInfo(row[0])
        writeInfoToDB(cursor, companyInfo)
finally:
    # Release the connection even if scraping or a query fails midway.
    db.close()
3、数据库结果
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论