참고 사항 및 기타/참고 사이트

Extracting Titles

# 2.6

NomadCoder 니코쌤의 python 크롤링 강좌 indeed 사이트 title 추출하기

import requests
from bs4 import BeautifulSoup

# Value sent as the `limit` query parameter of the search URL; also used as
# the multiplier for the `start` offset when a results page is requested.
LIMIT = 50
# Indeed job-search URL for the query "python" with the limit above applied.
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"

def extract_indeed_pages():
  """Return the last page number listed in the search page's pagination bar.

  Fetches URL, locates the ``div`` with class ``pagination`` and converts
  the text of every anchor inside it, except the final one, to ``int``.
  The last converted value is returned.
  """
  response = requests.get(URL)
  document = BeautifulSoup(response.text, "html.parser")
  pager = document.find("div", {"class": "pagination"})

  # The final anchor is left out of the conversion
  # (presumably a non-numeric "next" link -- TODO confirm).
  page_numbers = [int(anchor.string) for anchor in pager.find_all("a")[:-1]]
  return page_numbers[-1]


# Extracting Title
def extract_indeed_jobs(last_page):
  """Print the title of every job posting on the first results page.

  Args:
    last_page: Currently unused; only the first page (``start=0``) is
      requested. Kept so callers can already pass the page count.

  Returns:
    The ``jobs`` list. Nothing is appended to it yet, so it is always empty.
  """
  jobs = []
  # NOTE: paging over range(last_page) is not enabled yet; only page 0 is read.
  response = requests.get(f"{URL}&start={0*LIMIT}")
  soup = BeautifulSoup(response.text, "html.parser")
  cards = soup.find_all("div", {"class": "heading4 color-text-primary singleLineTitle tapItem-gutter"})
  for card in cards:
    heading = card.find("h2", {"class": "jobTitle"})
    if heading is None:
      # Card without a title heading: skip it instead of raising AttributeError.
      continue
    # A span reading "new" may come before the title span. Skip that badge
    # and print the next span, so the posting's title is not lost.
    for span in heading.find_all("span"):
      if span.string != "new":
        print(span.string)
        break
  return jobs


indeed 사이트의 html 구조 변경으로 인해 기존 강의 코드로는 추출되지 않아 내 방식대로 고쳐 봤는데,
new 부분은 좀 더 효율적으로 고칠 수 있을 것 같다.

 

# 2.7 Extract company

import requests
from bs4 import BeautifulSoup

# Value sent as the `limit` query parameter of the search URL; also used as
# the multiplier for the `start` offset when a results page is requested.
LIMIT = 50
# Indeed job-search URL for the query "python" with the limit above applied.
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"

def extract_indeed_pages():
  """Return the last page number listed in the search page's pagination bar.

  Fetches URL, locates the ``div`` with class ``pagination`` and converts
  the text of every anchor inside it, except the final one, to ``int``.
  The last converted value is returned.
  """
  response = requests.get(URL)
  document = BeautifulSoup(response.text, "html.parser")
  pager = document.find("div", {"class": "pagination"})

  # The final anchor is left out of the conversion
  # (presumably a non-numeric "next" link -- TODO confirm).
  page_numbers = [int(anchor.string) for anchor in pager.find_all("a")[:-1]]
  return page_numbers[-1]

#heading4 color-text-primary singleLineTitle tapItem-gutter
def extract_indeed_jobs(last_page):
  """Print the title and company name of each posting on the first results page.

  Args:
    last_page: Currently unused; only the first page (``start=0``) is
      requested. Kept so callers can already pass the page count.

  Returns:
    The ``jobs`` list. Nothing is appended to it yet, so it is always empty.
  """
  jobs = []
  # NOTE: paging over range(last_page) is not enabled yet; only page 0 is read.
  response = requests.get(f"{URL}&start={0*LIMIT}")
  soup = BeautifulSoup(response.text, "html.parser")
  cards = soup.find_all("td", {"class": "resultContent"})
  for card in cards:
    heading = card.find("h2", {"class": "jobTitle"})
    if heading is not None:
      # A span reading "new" may come before the title span. Skip that badge
      # and print the next span, so the posting's title is not lost.
      for span in heading.find_all("span"):
        if span.string != "new":
          print(span.string)
          break
    company = card.find("div", {"class": "heading6 company_location tapItem-gutter"})
    # Only an anchor inside the company block is printed; a missing block or
    # a block without an anchor is skipped rather than raising.
    company_link = company.find("a") if company is not None else None
    if company_link is not None:
      print(company_link.string)
  return jobs

#find_all -> get list
#find -> get first element

  

# Extracting Title