참고 사항 및 기타/참고 사이트
Extracting Titles
JunGi Jeong
2021. 7. 13. 00:29
# 2.6
NomadCoder 니코쌤의 python 크롤링 강좌 indeed 사이트 title 추출하기
import requests
from bs4 import BeautifulSoup
# Number of results requested per page; sent as the `limit` query parameter.
LIMIT = 50
# Base Indeed search URL for the query "python" with the page size applied.
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"
def extract_indeed_pages():
    """Return the highest page number listed in the Indeed pagination bar.

    Fetches the module-level ``URL`` and reads the links inside
    ``<div class="pagination">``.

    Returns:
        int: The largest numeric page label found. Falls back to 1 when the
        page has no pagination block or no numeric links, on the assumption
        that the results then fit on a single page.
    """
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(URL, timeout=10)
    soup = BeautifulSoup(response.text, "html.parser")

    pagination = soup.find("div", {"class": "pagination"})
    if pagination is None:
        return 1

    pages = []
    for link in pagination.find_all("a"):
        # Keep only links whose text is a page number. Navigation links
        # (such as a trailing "next" control) carry no number, and .string
        # is None when a link wraps more than one child node.
        label = (link.string or "").strip()
        if label.isdecimal():
            pages.append(int(label))

    # max() does not rely on the links being listed in ascending order.
    return max(pages, default=1)
# Extracting Title
def extract_indeed_jobs(last_page):
    """Collect job titles from the Indeed result pages.

    Args:
        last_page: Number of result pages to fetch. Page ``n`` is requested
            by appending ``&start={n * LIMIT}`` to the module-level ``URL``.

    Returns:
        list[dict]: One ``{"title": <str>}`` entry per job card whose title
        could be read, in the order the cards appear. Each title is also
        printed as it is found.
    """
    jobs = []
    for page in range(last_page):
        # A timeout keeps a stalled connection from hanging the script.
        response = requests.get(f"{URL}&start={page * LIMIT}", timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")
        cards = soup.find_all(
            "div",
            {"class": "heading4 color-text-primary singleLineTitle tapItem-gutter"},
        )
        for card in cards:
            heading = card.find("h2", {"class": "jobTitle"})
            if heading is None:
                continue
            # The first <span> of a card may be a "new" badge rather than
            # the title, so take the first span whose text is something
            # else instead of dropping the whole card.
            title = next(
                (
                    span.string
                    for span in heading.find_all("span")
                    if span.string and span.string != "new"
                ),
                None,
            )
            if title is None:
                continue
            print(title)
            jobs.append({"title": title})
    return jobs
Indeed 사이트의 HTML 구조 변경으로 인해 기존 강의의 코드로는 제목이 추출되지 않아 내 방식대로 고쳐 보았다.
다만 "new" 라벨을 처리하는 부분은 좀 더 효율적으로 고칠 수 있을 것 같다.
# 2.7 Extract company
import requests
from bs4 import BeautifulSoup
# Number of results requested per page; sent as the `limit` query parameter.
LIMIT = 50
# Base Indeed search URL for the query "python" with the page size applied.
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"
def extract_indeed_pages():
    """Return the highest page number listed in the Indeed pagination bar.

    Fetches the module-level ``URL`` and reads the links inside
    ``<div class="pagination">``.

    Returns:
        int: The largest numeric page label found. Falls back to 1 when the
        page has no pagination block or no numeric links, on the assumption
        that the results then fit on a single page.
    """
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(URL, timeout=10)
    soup = BeautifulSoup(response.text, "html.parser")

    pagination = soup.find("div", {"class": "pagination"})
    if pagination is None:
        return 1

    pages = []
    for link in pagination.find_all("a"):
        # Keep only links whose text is a page number. Navigation links
        # (such as a trailing "next" control) carry no number, and .string
        # is None when a link wraps more than one child node.
        label = (link.string or "").strip()
        if label.isdecimal():
            pages.append(int(label))

    # max() does not rely on the links being listed in ascending order.
    return max(pages, default=1)
#heading4 color-text-primary singleLineTitle tapItem-gutter
def extract_indeed_jobs(last_page):
    """Collect job titles and company names from the Indeed result pages.

    Args:
        last_page: Number of result pages to fetch. Page ``n`` is requested
            by appending ``&start={n * LIMIT}`` to the module-level ``URL``.

    Returns:
        list[dict]: One ``{"title": ..., "company": ...}`` entry per job
        card for which at least one of the two values was found; a value
        that could not be read is ``None``. Found values are also printed.
    """
    jobs = []
    for page in range(last_page):
        # A timeout keeps a stalled connection from hanging the script.
        response = requests.get(f"{URL}&start={page * LIMIT}", timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")

        for card in soup.find_all("td", {"class": "resultContent"}):
            heading = card.find("h2", {"class": "jobTitle"})
            spans = heading.find_all("span") if heading is not None else []
            # The first <span> of a card may be a "new" badge rather than
            # the title, so take the first span whose text is something
            # else instead of dropping the title.
            title = next(
                (s.string for s in spans if s.string and s.string != "new"),
                None,
            )
            if title is not None:
                print(title)

            company_box = card.find(
                "div", {"class": "heading6 company_location tapItem-gutter"}
            )
            anchor = company_box.find("a") if company_box is not None else None
            # NOTE(review): only companies rendered as a link are captured;
            # a company shown as plain text ends up as None - confirm
            # against the current page markup.
            company = anchor.string if anchor is not None else None
            if anchor is not None:
                print(company)

            if title is not None or company is not None:
                jobs.append({"title": title, "company": company})
    return jobs
#find_all -> get list
#find -> get first element
# Extracting Title