之前要用到海量商标数据,看到中知慧智的数据比较全,于是想爬取,结果发现它的数据也是来自于其母公司知产出版社的,爬个几万就废了。from selenium import webdriver from selenium.webdriver.common.by imp...
之前要用到海量商标数据,看到中知慧智的数据比较全,于是想爬取,结果发现它的数据也是来自于其母公司知产出版社的,爬个几万就废了。
# Crawler for trademark detail pages on s.ip-cld.com.
#
# For every registration number in [number, end) the script types a search
# expression into the site's expression-search page, opens the result list in
# a new window, follows the "商标详情" (trademark detail) link, scrapes the
# detail page with PyQuery and stores the record as one JSON string per row
# in the MySQL table `detail`.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from pyquery import PyQuery as pq
import pymysql
import time
import json

driver = webdriver.Firefox()
# Keyword arguments: PyMySQL >= 1.0 no longer accepts positional
# host/user/password/database.
# NOTE(review): credentials are hard-coded; consider reading them from the
# environment or a config file.
db = pymysql.connect(
    host="127.0.0.1",
    user="root",
    password="google",
    database="sbgg",
    charset='utf8',
)
driver.get("http://s.ip-cld.com/search/trademark/expression")

number = 128909  # first registration number to query
end = 1200000    # exclusive upper bound; crawling stops before this number

# Selector prefixes shared by most fields of the detail page.
_COLUMN = '#detailContent > div.column.clearfix > '
_UL3 = _COLUMN + 'ul:nth-child(3) > '
_UL4 = _COLUMN + 'ul:nth-child(4) > '

# (JSON key, CSS selector, extraction kind) in the order the keys are written
# to the JSON record. Kind is 'text' (element text), 'html' (inner HTML) or
# the name of an attribute to read.
_DETAIL_FIELDS = (
    ('reg_num', _UL3 + 'li:nth-child(2) > span', 'text'),   # registration number
    ('tmname', _UL3 + 'li:nth-child(1) > span', 'text'),    # trademark name
    ('picurl', _COLUMN + 'ul.footea > li:nth-child(1) > img', 'src'),  # image URL
    # NOTE(review): 'color' (designated colour) and 'share' (jointly owned
    # trademark?) use the very same selector, so they always hold the same
    # value. One of them is presumably wrong - confirm against the page.
    ('color', _UL4 + 'li:nth-child(2) > span', 'text'),
    ('share', _UL4 + 'li:nth-child(2) > span', 'text'),
    ('reg_date', _UL3 + 'li:nth-child(3) > span', 'text'),        # registration date
    ('first_ann_num', _UL4 + 'li:nth-child(3) > span', 'text'),   # preliminary announcement issue no.
    ('apply_num', _UL3 + 'li:nth-child(4) > span', 'text'),       # application number
    ('first_ann_date', _UL4 + 'li:nth-child(4) > span', 'text'),  # preliminary announcement date
    ('apply_date', _UL3 + 'li:nth-child(5) > span', 'text'),      # application date
    ('reg_ann_date', _UL4 + 'li:nth-child(5) > span', 'text'),    # registration announcement issue no.
    ('tm_type', _UL3 + 'li:nth-child(6) > span', 'text'),         # trademark type
    ('priority', _UL4 + 'li:nth-child(6) > span', 'text'),        # priority date
    ('tm_status', _UL3 + 'li:nth-child(7) > span', 'text'),       # trademark status
    ('international_reg_date', _UL4 + 'li:nth-child(7) > span', 'text'),  # international registration date
    ('nice_class', _UL3 + 'li:nth-child(8) > span', 'text'),      # Nice classification
    ('houzhiding_date', _UL4 + 'li:nth-child(8) > span', 'text'),  # subsequent designation date
    ('xiangshiqun', _UL3 + 'li:nth-child(9) > span', 'text'),     # similar-group number
    ('zhuanyongquan_date', _UL3 + 'li:nth-child(10) > span', 'text'),  # exclusive-right period
    ('reg_user', '#detailContent > div.line > ul > li:nth-child(1) > span', 'text'),     # applicant name
    ('reg_address', '#detailContent > div.line > ul > li:nth-child(2) > span', 'text'),  # applicant address
    ('agent_user', '#detailContent > div:nth-child(5) > ul > li > span', 'text'),        # agent name
    ('service_list', '#detailContent > div.service > span', 'text'),                     # goods/services list
    ('tm_ann_status', '#detailContent > div:nth-child(8) > table', 'html'),    # announcement status table
    ('tm_process', '#proTab', 'html'),                                         # trademark process
    ('transfe_process', '#detailContent > div.process > table', 'html'),       # transfer process table
)


def _extract_detail(source):
    """Return a dict with every field in _DETAIL_FIELDS scraped from *source*."""
    doc = pq(source)
    record = {}
    for key, selector, kind in _DETAIL_FIELDS:
        node = doc(selector)
        if kind == 'text':
            record[key] = node.text()
        elif kind == 'html':
            record[key] = node.html()
        else:
            record[key] = node.attr(kind)
    return record


def parse_html(source):
    """Scrape one trademark detail page and store it in the database.

    Args:
        source: HTML source of the detail page.

    The fields are serialised as one JSON object (non-ASCII kept verbatim)
    and handed to save_to_mysql together with the registration number.
    Values are plain strings (or None when an attribute/HTML is missing).
    The previous version accidentally wrapped all but the last field in
    1-tuples through trailing commas, which serialised as one-element lists.
    """
    record = _extract_detail(source)
    save_to_mysql(record['reg_num'], json.dumps(record, ensure_ascii=False))


def save_to_mysql(reg_num, data):
    """Insert *data* as one row of table `detail` and report the outcome.

    Args:
        reg_num: registration number, used only in the log message.
        data: JSON string to store in column `detail`.

    The value is bound as a query parameter instead of being concatenated
    into the SQL text, because the scraped HTML routinely contains quotes.
    Database errors are rolled back and printed; they do not stop the crawl.
    """
    sql = "insert into detail(detail) values(%s)"
    try:
        with db.cursor() as cursor:
            cursor.execute(sql, (data,))
        db.commit()
    except pymysql.MySQLError as err:
        db.rollback()
        print(str(reg_num) + " Save Failed: " + str(err))
    else:
        # Only report success once the commit has actually gone through.
        print(str(reg_num) + "Saved To Database!")


def _switch_to_new_window(known_handles, timeout=10):
    """Wait for a window not in *known_handles*, switch to it, return its handle.

    Raises TimeoutException when no new window shows up within *timeout* s.
    """
    def _new_handle(drv):
        fresh = [h for h in drv.window_handles if h not in known_handles]
        return fresh[0] if fresh else False

    handle = WebDriverWait(driver, timeout, 0.1).until(_new_handle)
    driver.switch_to.window(handle)
    return handle


def _close_extra_windows(main_handle):
    """Close every window except *main_handle* and switch back to it."""
    for handle in driver.window_handles:
        if handle != main_handle:
            driver.switch_to.window(handle)
            driver.close()
    driver.switch_to.window(main_handle)


def _crawl_one(reg_no):
    """Search for *reg_no*, scrape its detail page and store the result.

    Failures of a single number (timeout, no hit) are printed and skipped.
    All windows opened along the way are closed again and the driver is
    left on the main search window.
    """
    main_handle = driver.current_window_handle
    keyword = "注册号 = (" + str(reg_no) + ")"
    editor = driver.find_element(By.ID, 'selfeditor')
    editor.clear()
    editor.send_keys(keyword)
    driver.find_element(By.ID, "gosearch").click()

    try:
        # The search opens the result list in a new window.
        try:
            list_handle = _switch_to_new_window({main_handle})
            WebDriverWait(driver, 10, 0.1).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'jsjg')))
        except TimeoutException:
            print(str(reg_no) + " Page Loading Timeout!")
            return

        try:
            driver.find_element(By.LINK_TEXT, "商标详情").click()
        except WebDriverException:
            print(str(reg_no) + " Not Found Or Search Error!")
            return

        # Fixed grace period kept from the original script, then make sure
        # the detail tab exists and its content container is present.
        time.sleep(1)
        try:
            _switch_to_new_window({main_handle, list_handle})
            WebDriverWait(driver, 10, 0.1).until(
                EC.presence_of_element_located((By.ID, 'detailContent')))
        except TimeoutException:
            print(str(reg_no) + " Detail Page Loading Timeout!")
            return

        parse_html(driver.page_source)
    finally:
        _close_extra_windows(main_handle)


try:
    # `<` instead of `==` so a start value beyond `end` cannot loop forever.
    while number < end:
        _crawl_one(number)
        number = number + 1
finally:
    # Release the browser and the connection even if the crawl aborts.
    driver.quit()
    db.close()
全文详见:http://xpxw.com/?id=79