Crawling 하기
Crawling (자료수집)
파이썬의 웹드라이버(WebDriver) 기반 라이브러리인 셀레니움(Selenium)을 사용하여 자료를 수집한다.
import os
import time
import urllib.request

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Configure Chrome so that certificate / SSL problems on the crawled pages
# do not abort the session.
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors-spki-list')
options.add_argument('--ignore-ssl-errors')

# `options=` is the supported keyword; `chrome_options=` is its deprecated alias.
driver = webdriver.Chrome(options=options)
관심 있는 데이터를 수집하기 위해 데이터 구조(요즘 동물상 데이터 예시)를 먼저 정의한다.
# Maps a unique label ("<animal-face type><index>") to the person's name that
# is used as the search keyword for that label.
#
# Keys must be unique: in a dict literal a repeated key silently keeps only the
# last value. "돼지상4" used to appear twice, which dropped "김준현"; that entry
# now lives under the previously unused key "돼지상13", while "돼지상4" keeps
# the value it always resolved to ("박나래").
keywordDict = {
    "쥐상1": "민경훈", "쥐상2": "윤시윤", "쥐상3": "이명박",
    "쥐상4": "이특", "쥐상5": "최유진", "쥐상6": "하하",

    "다람쥐상1": "사나", "다람쥐상2": "웬디", "다람쥐상3": "한승연",

    "소상1": "문세윤", "소상2": "문재인",

    "호랑이상1": "김윤석", "호랑이상2": "나나", "호랑이상3": "노태우",
    "호랑이상4": "백윤식", "호랑이상5": "윤석열", "호랑이상6": "이덕화",
    "호랑이상7": "임재범", "호랑이상8": "지호", "호랑이상9": "천호진",
    "호랑이상10": "최민식",

    "토끼상1": "김슬기", "토끼상2": "나연", "토끼상3": "도티",
    "토끼상4": "비니", "토끼상5": "수지", "토끼상6": "수빈",
    "토끼상7": "수호", "토끼상8": "아이린", "토끼상9": "이나은",
    "토끼상10": "장원영", "토끼상11": "정국", "토끼상12": "희진",

    "용상1": "김수환", "용상2": "김영아", "용상3": "정주영",

    "뱀상1": "김옥빈", "뱀상2": "박정희", "뱀상3": "청하", "뱀상4": "헤이즈",

    "말상1": "가희", "말상2": "강타", "말상3": "시원", "말상4": "유열",
    "말상5": "이수만", "말상6": "이문세", "말상7": "제이홉",

    "양상1": "유재석",

    "원숭이상1": "박원순", "원숭이상2": "박진영", "원숭이상3": "빈지노",
    "원숭이상4": "손민수", "원숭이상5": "이재명", "원숭이상6": "최진실",
    "원숭이상7": "최진영", "원숭이상8": "MC몽",

    "강아지상1": "박보영", "강아지상2": "구혜선", "강아지상3": "문근영",
    "강아지상4": "임수정", "강아지상5": "한가인", "강아지상6": "한효주",
    "강아지상7": "송중기", "강아지상8": "손예진", "강아지상9": "송지효",
    "강아지상10": "윤은혜", "강아지상11": "장나라", "강아지상12": "한지민",
    "강아지상13": "설리", "강아지상14": "김국진", "강아지상15": "전현무",
    "강아지상16": "신혜선",

    "돼지상1": "강소라", "돼지상2": "강호동", "돼지상3": "고현정",
    "돼지상4": "박나래", "돼지상5": "서현", "돼지상6": "소유",
    "돼지상7": "손나은", "돼지상8": "솔비", "돼지상9": "송가인",
    "돼지상10": "옥주현", "돼지상11": "이하늬", "돼지상12": "정형돈",
    "돼지상13": "김준현",

    "고양이상1": "이효리", "고양이상2": "한예슬", "고양이상3": "김희선",
    "고양이상4": "한채영", "고양이상5": "안소희", "고양이상6": "유인영",
    "고양이상7": "차예련", "고양이상8": "박지윤", "고양이상9": "김수현",
    "고양이상10": "박경리", "고양이상11": "정수정", "고양이상12": "현아",
    "고양이상13": "고소영", "고양이상14": "신민아", "고양이상15": "제시카",
    "고양이상16": "제니",

    "사자상": "황교안",

    "여우상1": "가인", "여우상2": "김현정", "여우상3": "서예지",
    "여우상4": "서인국", "여우상5": "서지수", "여우상6": "신혜성",
    "여우상7": "유인나", "여우상8": "육성재", "여우상9": "이준",
    "여우상10": "이준기", "여우상11": "정채연", "여우상12": "지연",
    "여우상13": "한혜진", "여우상14": "키",

    "너구리상1": "모모", "너구리상2": "이낙연", "너구리상3": "조정석",

    "사슴상1": "성찬", "사슴상2": "신비", "사슴상3": "윤아",
    "사슴상4": "전지현", "사슴상5": "차은우", "사슴상6": "최강창민",

    "곰상1": "김태우", "곰상2": "슬기", "곰상3": "정준하", "곰상4": "카이",

    "늑대상1": "뷔", "늑대상2": "세훈",

    "거북이상1": "솔라", "거북이상2": "예리", "거북이상3": "이건희",
    "거북이상4": "하연수",

    "개구리상1": "노무현", "개구리상2": "전소민", "개구리상3": "조보아",
    "개구리상4": "하현우",

    "두꺼비상1": "주현",

    "공룡상1": "공유", "공룡상2": "김우빈", "공룡상3": "종현", "공룡상4": "첸",

    "상어상1": "개코",

    "복어상1": "김정은",

    "멸치상1": "은혁",

    "메기상1": "이상운", "메기상2": "이상준",
}
데이터 구조를 만든 후, 항목마다 해당하는 이름의 폴더를 만들어 수집한 이미지를 각각 분리해 저장한다.
import os
import time
import urllib.request

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# NOTE(review): a Chrome driver is also created earlier in this file; if both
# sections run in the same process, the earlier browser is left open — confirm
# whether this second setup is meant to replace it.

# Configure Chrome so that certificate / SSL problems on the crawled pages
# do not abort the session.
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors-spki-list')
options.add_argument('--ignore-ssl-errors')

# `options=` is the supported keyword; `chrome_options=` is its deprecated alias.
driver = webdriver.Chrome(options=options)
# For every (label, search keyword) pair: create a folder named
# "<label>(<keyword>)", run a Google image search for the keyword, click each
# thumbnail and save the enlarged preview image into that folder as
# 1.jpg, 2.jpg, ...

# Selector for the enlarged preview <img> shown after clicking a thumbnail.
# Built by implicit concatenation so that re-indenting this code can never
# leak whitespace into the selector.
preview_selector = (
    "#Sva75c > div > div > "
    "div.pxAole > div.tvh9oe.BIB1wf > c-wiz > div.OUZ5W > div.zjoqD > div > "
    "div.v4dQwb > a > img"
)

try:
    # NOTE(review): `keywordDict2` and `createFolder` are not defined in the
    # visible part of this file (only `keywordDict` is) — confirm both exist
    # before this loop runs.
    for name, keyword in keywordDict2.items():
        foldername = name + "(" + keyword + ")"
        myPath = "./" + foldername + "/"
        createFolder(foldername)

        # Submit the keyword through the Google image search box.
        driver.get("https://www.google.co.kr/imghp?hl=ko&tab=wi&ogbl")
        elem = driver.find_element_by_name("q")
        elem.send_keys(keyword)
        elem.send_keys(Keys.RETURN)
        time.sleep(1)  # give the result page time to render

        images = driver.find_elements_by_css_selector(".rg_i.Q4LuWd")
        count = 1  # next file number; advanced only after a successful save
        for image in images:
            try:
                image.click()
                time.sleep(0.5)  # wait for the preview panel to update
                url = driver.find_element_by_css_selector(
                    preview_selector
                ).get_attribute("src")
                fullfilename = os.path.join(myPath, str(count) + '.jpg')
                urllib.request.urlretrieve(url, fullfilename)
                count = count + 1
            except Exception as exc:
                # Best effort: skip thumbnails that cannot be opened or
                # downloaded, but let Ctrl+C / SystemExit propagate and report
                # what went wrong.
                print("error2", exc)
finally:
    # Shut the browser down only after every keyword has been processed, and
    # also when the crawl aborts part-way, so no browser session is leaked.
    driver.quit()