# -*- coding: utf-8 -*-
"""
Created on Tue May 17 16:26:31 2016
Scraped files smaller than 2 KB are problematic (the page was not fully collected).
@author: Administrator
"""
import requests
import bs4
import csv
import time
import random
import os
import threading

divident = 20                  # number of worker threads (batches)
fileName = 'combinedFile.csv'  # CSV meant to hold the combined results
bad_urls = []                  # URLs that failed to download

site_hubei = "http://china.guidechem.com/suppliers/list_catid-21_area-%E6%B9%96%E5%8C%97"
site_guangdong = "http://china.guidechem.com/suppliers/list_catid-21_area-广东"
site_shanghai = "http://china.guidechem.com/suppliers/list_catid-21_area-%E4%B8%8A%E6%B5%B7"
site_shanxi = "http://china.guidechem.com/suppliers/list_catid-21_area-陕西"
site_chongqing = "http://china.guidechem.com/suppliers/list_catid-21_area-重庆"
site_jiangsu = "http://china.guidechem.com/suppliers/list_catid-21_area-江苏"
pages_hubei = 31
pages_guangdong = 21
pages_shanghai = 34
pages_shanxi = 15
pages_chongqing = 2
pages_jiangsu = 67
start_page = 0

def Get_sites(site, pages):
    """Build the list of paginated listing URLs for one region."""
    list_pages = []
    for page in range(1, pages + 1):
        thePage = site + "-p" + str(page) + ".html"
        list_pages.append(thePage)
    return list_pages

def Get_company_name(elems, i):
    """Company name; empty string if the element is missing."""
    elems_company_name = elems[i].select(".dblue")
    if len(elems_company_name) == 0:
        return ""
    return elems_company_name[0].text

def Get_main_product(elems, i):
    """Main product; empty string if the element is missing."""
    elems_main_product = elems[i].select("li")
    if len(elems_main_product) < 2:
        return ""
    return elems_main_product[1].text.strip("\r\n")

def Get_phone_address(elems, i):
    """Phone and address; either may be missing, so both default to ""."""
    phone = ""
    address = ""
    elems_contact = elems[i].select(".site_l")
    content_contact = elems_contact[0].text
    content_contact1 = content_contact.strip("\r\n\r\n\t\r\n")
    content_contact2 = content_contact1.strip("\r\n")
    list_content_contact = content_contact2.split("\r\n\r\n")
    if len(list_content_contact) == 2:
        phone = list_content_contact[0]
        address = list_content_contact[1]
    elif len(list_content_contact) == 1:
        # Sometimes one field is missing; decide which one from the label in the text.
        content = list_content_contact[0]
        if "地址" in content:   # "address" label
            address = content
        if "电话" in content:   # "phone" label
            phone = content
    return (phone, address)

def Get_page_information(url):
    """Scrape one listing page (up to 20 companies) into a list of rows."""
    list_rows_information = []
    res = requests.get(url)
    soup = bs4.BeautifulSoup(res.text, "lxml")
    elems = soup.select(".clist_list_content_r")
    for i in range(len(elems)):
        try:
            company_name = Get_company_name(elems, i)
            main_product = Get_main_product(elems, i)
            phone, address = Get_phone_address(elems, i)
            list_rows_information.append([company_name, main_product, phone, address])
        except Exception:
            print("error at:", i)
            continue
    return list_rows_information

def Write_table_to_csv(url):
    """Write one page's rows (a 2-D list [[a], [b], [c]]) to a per-page CSV."""
    list_tableContent = Get_page_information(url)
    fileName = os.path.splitext(url)[0][-3:] + ".csv"   # e.g. 'p12.csv'
    file = open(fileName, 'w', newline='')
    writer1 = csv.writer(file)
    writer1.writerows(list_tableContent)
    file.close()

def Write_allTables_to_csvs(list_pages):
    """Single-threaded fallback: scrape every page with a polite delay."""
    for i in range(start_page, len(list_pages)):
        try:
            Write_table_to_csv(list_pages[i])
            time.sleep(random.randint(30, 31))
        except Exception:
            print("error at:", i)
            continue

def Step(urls_list, divident):
    """Pages each thread handles when the list is split into `divident` batches."""
    return int(len(urls_list) / divident)

def Left(urls_list):
    """Pages left over after the even split into batches."""
    step = Step(urls_list, divident)
    return len(urls_list) - step * divident

def download_range(start, end):
    """Scrape the pages list_pages[start:end]; remember failures in bad_urls."""
    for url in list_pages[start:end]:
        try:
            Write_table_to_csv(url)
        except Exception:
            bad_urls.append(url)
            continue

# --- main ---
list_pages = Get_sites(site_jiangsu, pages_jiangsu)
step = Step(list_pages, divident)   # pages per thread
left = Left(list_pages)             # leftover pages

# Single-threaded alternative:
# Write_allTables_to_csvs(list_pages)

downloadThreads = []  # a list of all the Thread objects
# Launch `divident` threads, each covering `step` pages.
for i in range(0, len(list_pages) - left, step):
    downloadThread = threading.Thread(target=download_range, args=(i, i + step))
    downloadThreads.append(downloadThread)
    downloadThread.start()
# Scrape the leftover pages in the main thread.
if left > 0:
    download_range(len(list_pages) - left, len(list_pages))
# Wait for all threads to end.
for downloadThread in downloadThreads:
    downloadThread.join()
print('Done.')

'''Test
#downloadThread = threading.Thread(target=download_range, args=(10, 12))
#downloadThread.start()
downloadThread = threading.Thread(target=download_range, args=(12, 14))
downloadThread.start()
downloadThread = threading.Thread(target=download_range, args=(14, 16))
downloadThread.start()

i = 3
res = requests.get(list_pages[i])
soup = bs4.BeautifulSoup(res.text, "lxml")
elems = soup.select(".clist_list_content_r")
# contact info
elems_contact = elems[2].select(".site_l")
content_contact = elems_contact[0].text
content_contact1 = content_contact.strip("\r\n\r\n\t\r\n")
content_contact2 = content_contact1.strip("\r\n")
list_content_contact = content_contact2.split("\r\n\r\n")
# Sometimes one field is missing; decide which one from the label in the text.
if len(list_content_contact) == 2:
    phone = list_content_contact[0]
    address = list_content_contact[1]
if len(list_content_contact) == 1:
    content = list_content_contact[0]
    if "地址" in content:
        address = content
        phone = ""
    if "电话" in content:
        phone = content
        address = ""
'''
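
# The script reserves fileName = 'combinedFile.csv' but never fills it. Below is a
# minimal merge sketch, assuming the per-page CSVs (e.g. 'p12.csv') sit in the
# current working directory; the glob pattern and function name are assumptions,
# not part of the original script.
import csv, glob

def combine_csvs(pattern="*.csv", out_name="combinedFile.csv"):
    """Concatenate every per-page CSV into one combined file."""
    with open(out_name, 'w', newline='') as out_file:
        writer = csv.writer(out_file)
        for name in sorted(glob.glob(pattern)):
            if name == out_name:
                continue  # don't read the output file into itself
            with open(name, newline='') as in_file:
                writer.writerows(csv.reader(in_file))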
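
# The header docstring notes that scraped files under 2 KB are problematic
# (truncated pages). A sketch that flags such files for re-collection; the
# 2048-byte threshold comes from that note, and the helper name is an assumption.
import os

def find_bad_files(folder=".", threshold=2048):
    """Return the CSV files in `folder` smaller than `threshold` bytes."""
    bad = []
    for name in os.listdir(folder):
        path = os.path.join(folder, name)
        if name.endswith(".csv") and os.path.getsize(path) < threshold:
            bad.append(name)
    return bad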
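
# bad_urls collects the pages that failed, but the script never retries them.
# A hedged retry pass that could run after the join() loop; the retry count and
# the reuse of the script's 30-second delay are assumptions.
def retry_bad_urls(retries=2):
    """Re-scrape failed pages a few times, keeping still-failing URLs in bad_urls."""
    for attempt in range(retries):
        remaining = list(bad_urls)
        del bad_urls[:]            # reuse the module-level failure list
        for url in remaining:
            try:
                Write_table_to_csv(url)
            except Exception:
                bad_urls.append(url)
        if not bad_urls:
            break
        time.sleep(random.randint(30, 31))  # same polite delay as the script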