之前要用到海量商标数据，看到中知慧智的数据比较全，于是想爬取，结果发现它的数据也是来自于其母公司知产出版社的，爬个几万就废了。from selenium import webdriver from selenium.webdriver.common.by imp...
之前要用到海量商标数据，看到中知慧智的数据比较全，于是想爬取，结果发现它的数据也是来自于其母公司知产出版社的，爬个几万就废了。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from pyquery import PyQuery as pq
import pymysql
import time
import json
# Browser session that drives the search site.
driver = webdriver.Firefox()
# MySQL connection used by save_to_mysql(). Keyword arguments are required
# here: PyMySQL 1.0+ no longer accepts host/user/password/database
# positionally.
db = pymysql.connect(
    host="127.0.0.1",
    user="root",
    password="google",
    database="sbgg",
    charset='utf8',
)
# Open the expression-search page; the main loop types its queries here.
driver.get("http://s.ip-cld.com/search/trademark/expression")
# Registration number the crawl starts from.
number = 128909
# Registration number at which the crawl stops (this value is not queried).
end = 1200000
def parse_html(source):
    """Extract the trademark fields from a detail page and store them.

    The fields are collected into a dict, serialized to JSON (non-ASCII
    characters kept as-is) and handed to ``save_to_mysql`` together with the
    registration number.

    Every field is stored as a plain value.  Earlier revisions ended each
    assignment with a stray comma, which wrapped the values in 1-tuples and
    therefore wrote single-element JSON arrays.

    Args:
        source: HTML of the trademark detail page.
    """
    doc = pq(source)

    def text(selector):
        # Text content of the element(s) matched by ``selector``.
        return doc(selector).text()

    def html(selector):
        # Inner HTML of the element matched by ``selector``.
        return doc(selector).html()

    data = {
        # Registration number.
        'reg_num': text('#detailContent > div.column.clearfix > ul:nth-child(3) > li:nth-child(2) > span'),
        # Trademark name.
        'tmname': text('#detailContent > div.column.clearfix > ul:nth-child(3) > li:nth-child(1) > span'),
        # URL of the trademark image.
        'picurl': doc('#detailContent > div.column.clearfix > ul.footea > li:nth-child(1) > img').attr('src'),
        # Designated colour.
        # NOTE(review): same selector as 'share' below -- one of the two is
        # probably wrong; confirm against the live page.
        'color': text('#detailContent > div.column.clearfix > ul:nth-child(4) > li:nth-child(2) > span'),
        # Whether the trademark is jointly owned.
        'share': text('#detailContent > div.column.clearfix > ul:nth-child(4) > li:nth-child(2) > span'),
        # Registration date.
        'reg_date': text('#detailContent > div.column.clearfix > ul:nth-child(3) > li:nth-child(3) > span'),
        # Preliminary-approval announcement issue number.
        'first_ann_num': text('#detailContent > div.column.clearfix > ul:nth-child(4) > li:nth-child(3) > span'),
        # Application number.
        'apply_num': text('#detailContent > div.column.clearfix > ul:nth-child(3) > li:nth-child(4) > span'),
        # Preliminary-approval announcement date.
        'first_ann_date': text('#detailContent > div.column.clearfix > ul:nth-child(4) > li:nth-child(4) > span'),
        # Application date.
        'apply_date': text('#detailContent > div.column.clearfix > ul:nth-child(3) > li:nth-child(5) > span'),
        # Registration announcement issue number (despite the "_date" key).
        'reg_ann_date': text('#detailContent > div.column.clearfix > ul:nth-child(4) > li:nth-child(5) > span'),
        # Trademark type.
        'tm_type': text('#detailContent > div.column.clearfix > ul:nth-child(3) > li:nth-child(6) > span'),
        # Priority date.
        'priority': text('#detailContent > div.column.clearfix > ul:nth-child(4) > li:nth-child(6) > span'),
        # Trademark status.
        'tm_status': text('#detailContent > div.column.clearfix > ul:nth-child(3) > li:nth-child(7) > span'),
        # International registration date.
        'international_reg_date': text('#detailContent > div.column.clearfix > ul:nth-child(4) > li:nth-child(7) > span'),
        # Nice classification.
        'nice_class': text('#detailContent > div.column.clearfix > ul:nth-child(3) > li:nth-child(8) > span'),
        # Subsequent designation date.
        'houzhiding_date': text('#detailContent > div.column.clearfix > ul:nth-child(4) > li:nth-child(8) > span'),
        # Similar-group numbers.
        'xiangshiqun': text('#detailContent > div.column.clearfix > ul:nth-child(3) > li:nth-child(9) > span'),
        # Term of the exclusive right.
        'zhuanyongquan_date': text('#detailContent > div.column.clearfix > ul:nth-child(3) > li:nth-child(10) > span'),
        # Applicant name.
        'reg_user': text('#detailContent > div.line > ul > li:nth-child(1) > span'),
        # Applicant address.
        'reg_address': text('#detailContent > div.line > ul > li:nth-child(2) > span'),
        # Agent name.
        'agent_user': text('#detailContent > div:nth-child(5) > ul > li > span'),
        # Goods/services list.
        'service_list': text('#detailContent > div.service > span'),
        # Trademark announcement status table (raw HTML).
        'tm_ann_status': html('#detailContent > div:nth-child(8) > table'),
        # Trademark process table (raw HTML).
        'tm_process': html('#proTab'),
        # Transfer process table (raw HTML).
        'transfe_process': html('#detailContent > div.process > table'),
    }
    save_to_mysql(data['reg_num'], json.dumps(data, ensure_ascii=False))
def save_to_mysql(reg_num, data):
    """Insert one serialized record into the ``detail`` table.

    The insert is committed on success and rolled back on failure; either
    outcome is reported on stdout.  Database errors are not re-raised, so a
    failed row does not stop the caller.

    Args:
        reg_num: Registration number, used only in the progress messages.
        data: String written to the ``detail`` column (the caller passes
            JSON text).
    """
    # Pass the value as a query parameter so the driver does the quoting:
    # the payload contains quotes (JSON, embedded HTML) that would break a
    # statement assembled by string concatenation.
    sql = "insert into detail(detail) values(%s)"
    try:
        with db.cursor() as cursor:
            cursor.execute(sql, (data,))
        db.commit()
    except Exception as err:
        db.rollback()
        print(str(reg_num) + " Save Failed: " + str(err))
    else:
        # Only announce success once the commit has actually gone through.
        print(str(reg_num) + "Saved To Database!")
def _close_tab_and_return(main_window):
    """Close the currently focused tab and refocus the search tab."""
    driver.close()
    driver.switch_to.window(main_window)


# Handle of the tab holding the search form; every iteration starts and
# ends with this tab focused.
main_window = driver.current_window_handle
try:
    # range() is built once from the starting value of ``number``; the loop
    # then rebinds ``number`` to each registration number in turn and simply
    # does nothing if the start is already at or past ``end``.
    for number in range(number, end):
        keyword = "注册号 = (" + str(number) + ")"
        driver.find_element(By.ID, 'selfeditor').clear()
        driver.find_element(By.ID, 'selfeditor').send_keys(keyword)
        driver.find_element(By.ID, "gosearch").click()
        # Switch to the result-list tab (second window handle).
        driver.switch_to.window(driver.window_handles[1])

        try:
            # Wait up to 10 s, polling every 0.1 s, for the result list.
            WebDriverWait(driver, 10, 0.1).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'jsjg')))
        except Exception:
            print(str(number) + " Page Loading Timeout!")
            _close_tab_and_return(main_window)
            continue

        try:
            driver.find_element(By.LINK_TEXT, "商标详情").click()
            # Crude pause after the click, before the window handles are
            # read again below.
            time.sleep(1)
            # Close the result-list tab, which is still the focused one.
            driver.close()
        except Exception:
            print(str(number) + " Not Found Or Search Error!")
            _close_tab_and_return(main_window)
            continue

        # With the list tab closed, the second handle is the detail tab.
        driver.switch_to.window(driver.window_handles[1])
        parse_html(driver.page_source)
        _close_tab_and_return(main_window)
finally:
    # Release the browser and the database connection however the crawl
    # ends (finished, crashed or interrupted).
    driver.quit()
    db.close()
全文详见:http://xpxw.com/?id=79