Notepad++Good Luck To You!

中知慧智爬虫
浏览: 3869    评论: 0

之前要用到海量商标数据,看到中知慧智的数据比较全,于是想爬取,结果发现它的数据也是来自其母公司知产出版社的,爬个几万就废了。from selenium import webdriver from selenium.webdriver.common.by imp...


之前要用到海量商标数据,看到中知慧智的数据比较全,于是想爬取,结果发现它的数据也是来自其母公司知产出版社的,爬个几万就废了。

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from pyquery import PyQuery as pq
import pymysql
import time
import json
# Browser session shared by the whole script; every page is loaded through it.
driver=webdriver.Firefox()

# MySQL connection used by save_to_mysql(). Keyword arguments are required:
# PyMySQL 1.0+ no longer accepts host/user/password/database positionally.
# NOTE(review): credentials are hard-coded here; consider moving them to
# configuration or environment variables.
db = pymysql.connect(
	host="127.0.0.1",
	user="root",
	password="google",
	database="sbgg",
	charset='utf8',
)

# Open the expression-search page that the crawl loop types its queries into.
driver.get("http://s.ip-cld.com/search/trademark/expression")
# First registration number to query; the crawl loop increments it by one per query.
number=128909
# The crawl stops once `number` reaches this value (this number is not queried).
end=1200000

def parse_html(source):
	"""Extract the trademark detail fields from a detail page and store them.

	The page is queried with CSS selectors, the values are collected into a
	dict, the dict is serialized to a JSON string (non-ASCII characters kept
	as-is) and handed to save_to_mysql() together with the registration
	number.

	Each field is stored as the plain value returned by pyquery, not wrapped
	in a one-element tuple, so the JSON contains scalars rather than
	single-item lists and the registration number is logged without tuple
	punctuation.

	Args:
		source: HTML text of the trademark detail page.
	"""
	doc = pq(source)

	# Most fields live in one of two <ul> columns; only the <li> index differs.
	left = '#detailContent > div.column.clearfix > ul:nth-child(3) > li:nth-child(%d) > span'
	right = '#detailContent > div.column.clearfix > ul:nth-child(4) > li:nth-child(%d) > span'

	# (key, selector, how) in the order the keys appear in the stored JSON.
	# `how` selects the extraction: 'text' -> .text(), 'html' -> .html(),
	# 'src' -> .attr('src').
	fields = (
		('reg_num', left % 2, 'text'),  # registration number
		('tmname', left % 1, 'text'),  # trademark name
		('picurl', '#detailContent > div.column.clearfix > ul.footea > li:nth-child(1) > img', 'src'),  # image URL
		# NOTE(review): 'color' and 'share' read the very same element
		# (right column, 2nd item); one of the two selectors is probably
		# wrong -- confirm against the live page.
		('color', right % 2, 'text'),  # designated colour
		('share', right % 2, 'text'),  # jointly owned trademark or not
		('reg_date', left % 3, 'text'),  # registration date
		('first_ann_num', right % 3, 'text'),  # preliminary announcement issue no.
		('apply_num', left % 4, 'text'),  # application number
		('first_ann_date', right % 4, 'text'),  # preliminary announcement date
		('apply_date', left % 5, 'text'),  # application date
		('reg_ann_date', right % 5, 'text'),  # registration announcement issue no.
		('tm_type', left % 6, 'text'),  # trademark type
		('priority', right % 6, 'text'),  # priority date
		('tm_status', left % 7, 'text'),  # trademark status
		('international_reg_date', right % 7, 'text'),  # international registration date
		('nice_class', left % 8, 'text'),  # Nice classification
		('houzhiding_date', right % 8, 'text'),  # subsequent designation date
		('xiangshiqun', left % 9, 'text'),  # similar-group number
		('zhuanyongquan_date', left % 10, 'text'),  # exclusive-right period
		('reg_user', '#detailContent > div.line > ul > li:nth-child(1) > span', 'text'),  # applicant name
		('reg_address', '#detailContent > div.line > ul > li:nth-child(2) > span', 'text'),  # applicant address
		('agent_user', '#detailContent > div:nth-child(5) > ul > li > span', 'text'),  # agent name
		('service_list', '#detailContent > div.service > span', 'text'),  # goods/services list
		('tm_ann_status', '#detailContent > div:nth-child(8) > table', 'html'),  # announcement status table
		('tm_process', '#proTab', 'html'),  # trademark process
		('transfe_process', '#detailContent > div.process > table', 'html'),  # transfer process table
	)

	data = {}
	for key, selector, how in fields:
		node = doc(selector)
		if how == 'text':
			data[key] = node.text()
		elif how == 'html':
			data[key] = node.html()
		else:
			data[key] = node.attr('src')

	# ensure_ascii=False keeps the Chinese text readable in the stored JSON.
	payload = json.dumps(data, ensure_ascii=False)

	save_to_mysql(data['reg_num'], payload)
		
def save_to_mysql(reg_num,data):
	"""Insert one serialized trademark record into the `detail` table.

	The statement is parameterized so that quotes or backslashes inside the
	scraped JSON cannot break (or inject into) the SQL. On failure the
	transaction is rolled back and the error is reported; the exception is
	not re-raised, so a single bad record does not stop the caller.

	Args:
		reg_num: Registration number of the record; used only in the
			console messages.
		data: JSON string written to the `detail` column.
	"""
	sql = "insert into detail(detail) values(%s)"
	try:
		with db.cursor() as cursor:
			cursor.execute(sql, (data,))
		db.commit()
	except Exception as err:
		db.rollback()
		print(str(reg_num)+" Save Failed: "+str(err))
	else:
		# Announce success only after the commit actually went through.
		print(str(reg_num)+"Saved To Database!")

# Main crawl loop: query one registration number per iteration, open its
# detail page, parse and store it, then return to the search tab.
# Locators use find_element(By..., ...) because the find_element_by_*
# helpers were removed in Selenium 4.
while number < end:
	keyword="注册号 = ("+str(number)+")"
	driver.find_element(By.ID, 'selfeditor').clear()
	driver.find_element(By.ID, 'selfeditor').send_keys(keyword)
	driver.find_element(By.ID, "gosearch").click()

	# The code below expects the result list in a second tab; wait until that
	# tab exists instead of indexing the handle list immediately after the click.
	WebDriverWait(driver,10,0.1).until(EC.number_of_windows_to_be(2))
	all_handles = driver.window_handles
	driver.switch_to.window(all_handles[1]) # switch to the result-list tab

	try:
		# Wait (up to 10 s, polling every 0.1 s) for the result container.
		WebDriverWait(driver,10,0.1).until(EC.presence_of_element_located((By.CLASS_NAME, 'jsjg')))
		try:
			driver.find_element(By.LINK_TEXT, "商标详情").click()
			time.sleep(1)
			# Close the result-list tab; the code below expects the detail
			# page to be the second remaining tab.
			driver.close()
		except Exception:
			print(str(number)+" Not Found Or Search Error!")
			driver.close()
			driver.switch_to.window(all_handles[0])
			number=number+1
			continue
	except Exception:
		print(str(number)+" Page Loading Timeout!")
		driver.close()
		driver.switch_to.window(all_handles[0])
		number=number+1
		continue

	all_handles = driver.window_handles
	driver.switch_to.window(all_handles[1]) # switch to the detail tab

	source=driver.page_source # HTML of the detail page
	parse_html(source) # parse and store the record
	driver.close()

	driver.switch_to.window(all_handles[0]) # back to the main search tab
	number=number+1

# Crawl finished: release the browser and the database connection.
driver.quit()
db.close()
	


全文详见:http://xpxw.com/?id=79

TOP


«    2024年10月    »
123456
78910111213
14151617181920
21222324252627
28293031
TOP 搜索
TOP 控制面板
您好,欢迎到访网站!
  查看权限
TOP 最新留言
    TOP 作者列表
    TOP 站点信息
    • 文章总数:163
    • 页面总数:0
    • 分类总数:6
    • 标签总数:20
    • 评论总数:0
    • 浏览总数:361552