# Spaces: Yassmen / Job.web.scrapping (Sleeping)
# Job.web.scrapping -- File size: 5,115 Bytes

import logging
import re
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# wuzzuf function
def Wuzzuf_scrapping(job_type , job_num):
    job1 = job_type.split(" ")[0]
    job2 = job_type.split(" ")[1]
    link1 = 'https://wuzzuf.net/search/jobs/?a=navbl&q='+job1+'%20'+job1
    title = []
    location = []
    country = []
    job_description = []
    Job_Requirements =[]
    company_name = []
    links = []
    Jop_type = []
    Career_Level = []
    company_logo = []
    Job_Categories = []
    Skills_And_Tools = []
    Experience_Needed =[]
    post_time = []
    Title = []
    pages_num = np.ceil(job_num/15)


    for i in range(int(pages_num) ):
      link_new = link1 +'&start='+str(i)
      try:
            data = requests.get(link_new)
            data.raise_for_status()  # Check for HTTP errors
            soup = BeautifulSoup(data.content, 'html.parser')
            Title = soup.find_all('h2', {'class': 'css-m604qf'})

            for x in range(len(Title)):
                title.append(Title[x].find('a').text.strip())
                loc = soup.find_all('span', {'class': 'css-5wys0k'})[x].text.split(',')
                location.append(loc[0].strip())
                country.append(loc[-1].strip())
                links.append('https://wuzzuf.net' + Title[x].find('a').attrs['href'])

      except requests.exceptions.RequestException as e:
           # print(f"Request failed: {e}")
          continue  # Skip to the next page if there's an error
   

  # to get the info about jobs

      for x in range(0,len(Title)):
        t = re.split('\(|\-',Title[x].find('a').text)
        title.append(t[0].strip())
        loc = re.split(',' , soup.find_all('span' , {'class': 'css-5wys0k'})[x].text)
        r = ""
        for i in range(len(loc[:-1])):
          r= r+ ', ' +loc[:-1][i].strip()
        location.append(r.replace(',', '', 1).strip())
        country.append(loc[-1].strip())
        links.append('https://wuzzuf.net' + Title[x].find('a').attrs['href'])
        m = " ".join(re.findall("[a-zA-Z\d+]+", (soup.find_all('div' , {'class': 'css-d7j1kk'})[x].find('a').text)))
        company_name.append(m)
        c = soup.find_all('div' ,{'class':'css-1lh32fc'})[x].find_all('span')
        if len(c) ==1:
          Jop_type.append(c[0].text)
        else:
          n =[]
          for i in range(len(c)):
            n.append(c[i].text)
          Jop_type.append(n)
        n =soup.find_all('div' ,{'class':'css-y4udm8'})[x].find_all('div')[1].find_all(['a','span'])
        Career_Level.append(n[0].text)
        n =soup.find_all('div' ,{'class':'css-y4udm8'})[x].find_all('div')[1].find_all(['a','span'])

        yy = n[1].text.replace('·',' ').strip()
        yy = re.findall('[0-9-+]*',yy)
        y1 =""
        for i in range(len(yy)):
        
          if any(yy[i]):
            y1 = y1+yy[i]
        if y1 != "":
          Experience_Needed.append(y1)
        else:
          Experience_Needed.append("Not Specified")
        time = (soup.find_all('div' ,{'class':'css-d7j1kk'}))[x].find('div')
        post_time.append(time.text)
        
  # to get the logo of the company
      
        data1  = requests.get(links[x])
        soup1 = BeautifulSoup(data1.content)
        company_logo.append(soup1.find_all('meta',{'property':"og:image"})[0]['content'])
        #time.sleep(4)


  # get Job_Categories , Skills_And_Tools , job_description , and job_requirements from urls
        driver = webdriver.Chrome('chromedriver',options=options)
        #driver.implicitly_wait(10)
        driver.get(links[x])
        Job_Categories.append(driver.find_element(By.XPATH ,'//*[@id="app"]/div/main/section[2]/div[5]').text.split("\n")[1:])
        Skills_And_Tools.append(driver.find_element(By.XPATH ,'//*[@id="app"]/div/main/section[2]/div[6]').text.split("\n")[1:])
        job_description.append(driver.find_element(By.XPATH ,'//*[@id="app"]/div/main/section[3]').text.split("\n")[1:])
        all =driver.find_elements(By.XPATH ,'//*[@id="app"]/div/main/section[4]/div')
        dict_other = {}
    
        new = all[0].text.split("\n\n")

        if len(new)!=1 :
          for i in range(len(new)):
            result =[]
            for k in (new[i].split('\n')[1:]):  
              result.append(k.replace("\u202f"," "))
              dict_other[new[i].split('\n')[0]] = result

            #result = re.sub('[\W_]+', '', ini_string)

          Job_Requirements.append(dict_other)

        else:
          nn = new[0].replace("\u202f"," ")
          Job_Requirements.append(nn.split('\n'))


  #  create data frame to combine all together

    df = pd.DataFrame({'Title' : title , 'Location' : location ,'country':country,'URLs':links ,'Company_Name' : company_name,'Career_Level':Career_Level,'post_time':post_time,'Experience_Needed':Experience_Needed,'Company_Logo':company_logo,"Job_Categories":Job_Categories , "Skills_And_Tools":Skills_And_Tools , "job_description":job_description,"Job_Requirements":Job_Requirements})
    
    df[:job_num].to_excel('WUZZUF_scrapping.xlsx',index=False,encoding='utf-8')
    return df[:job_num]