
    ݫGi%                     l    d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	m
Z
 d ZddZd Zd	 Zd
 Zy)z,
Utility functions for Link Checker package
    N)urlparse   )
ProxyErrorContentFetchErrorc                    t        d      D ]  }	 t        j                  ddd| d   d    id      }|j                          |j	                         }t        j                  |d	         }|d
   }t        |d   d         }|j                         dz   |z   }|dk(  s|st        d      |c S  	 t        d      5 }	|	j                         j                         }
ddd       t        j                  
      }|j                         S # t        $ r$}|dk  rY d}~t        d|        Y d}~"d}~ww xY w# 1 sw Y   ^xY w# t        $ r}t        d|       d}~ww xY w)z
    Get proxy from API or file with retry
    
    Args:
        config (dict): Configuration containing proxy settings
    
    Returns:
        str: Proxy string (ip:port)
       z/https://proxy.webshare.io/api/proxy/list?page=1AuthorizationzToken apiproxy_api_token
   )headerstimeoutresultsproxy_addressportshttp:unknownzReceived empty proxy from API   Nz)API proxy fetch failed after 3 attempts: z"/opt/aparser/files/proxy/proxy.txtz,Failed to get proxy from both API and file: )rangerequestsgetraise_for_statusjsonrandomchoicestrstripr   	Exceptionprintopenread
splitlines)configattemptresponser   proxy_detailsr   
proxy_portproxyefliness              5/var/www/html/utilities/link_checker_package/utils.py	get_proxyr.      su    8 G	G||A(F6%=AR3S2T*UVH
 %%'mmoG"MM')*<=M)/:M]73F;<J!'')C/*<E	! !@AALG.M67 	*1FFH'')E	*e${{}  	G{A!EFF		G	* 	*  MGsKLLMsN   BD/E :D6,E 	D3D.D..D36D?;E 	EEEc                    	 t        j                  | d|       	 t	        j                  |      j                  }|dkD  rt        j                  d|z   d       y
t        |      j                         }	 t        j                  d|z   d       |S # t         j                  $ rT t        d| d       	 t        j
                  j                  |      rt        j                  d|z   d       Y y#  Y Y yxY wt        $ r}t        d|        Y d	}~y
d	}~ww xY w# t        $ r0 	 t        |d      j                         }n# t        $ r Y Y y
w xY wY w xY w#  Y |S xY w)a  
    Get content using scraper command with timeout
    
    Args:
        command (str): Scraper command
        link_id (str): Link ID for file
        timeout (int): Timeout in seconds (default: 30)
    
    Returns:
        str: Content string, or "TIMEOUT" if timeout occurred
    T)shellr   zScraper timed out after z secondszrm )r0   TIMEOUTzScraper command failed: N i z
ISO-8859-1)encoding)
subprocesscheck_outputTimeoutExpiredr    ospathexistsr   statst_sizer!   r"   )commandlink_idr   r*   	file_sizecontents         r-   get_contentr@   7   sR   tWEGGG$,,	x##EGO4@w-$$&t< N? $$ (	:;	ww~~g&''tD 	 (,-  	7\:??AG 		 Nsv   B >D D 4E "D39C..C30D3D>DD	E#D?>E?	EEEEEEc                    	 | dd }|j                  d      }g }d}d}|D ]  }|sd|v rE|j                         }	 |d   j                         }t        |      d	k  r|}|j	                  |       d
|v sd|v sX	 |j                         }	|	d   j                         }t        |      }
|
j                  dk(  st        |
j                        dk  rd||gc S d|v rd||gc S d|v rd||gc S  t        |      dk(  rdd|gS |dk(  r|rd||gS |dk(  rd||gS |d|gS # t        $ r	 dddggcY S w xY w#  Y xY w# t        $ r Y #w xY w)z
    Test content and extract HTTP codes and redirects
    
    Args:
        content (str): Raw content from scraper
    
    Returns:
        list: [status_code, redirect_url, [all_codes]]
    r   i  
200nullr2   HTTPr      Locationz	location:/r   300
rapidgator301zredirect.se205302)splitr   r   lenappendr   r8   )r?   sub_contentsub_contentscodesres_urlcodeline
line_partscode_line_parts2os              r-   test_contentr[   g   s   (ao"((. EGD ~!ZZ\
&qM//1E5zA~$T* T![D%8"&**,K)!n224G )Avv}AFFa %w66#w. %w66$/ %w66 0-: 5zQvu%%u}7E**u}w&& &%  _  (vw''(   ! s;   D 4D09AD7D7 D7D-,D-0D47	EEc           	      6   ddddddddd}|rp|j                  d      rd|d<   dj                  |d         |d<   |j                  d	      rd|d
<   dj                  |d	         |d<   |j                  d      rd|d<   |r|j                  d      |d   |d<   |j                  d      r|d   |d<   |j                  d
      |d
   |d
<   |j                  d      r|d   |d<   |j                  d      |d   |d<   |j                  d      |d   |d<   |j                  d      |d   |d<   |j                  d      |d   |d<   | r| j                  d      | d   |d<   | j                  d      r| d   |d<   | j                  d
      | d
   |d
<   | j                  d      r| d   |d<   | j                  d      }|dv rd|d<   n|dv r	 | j                  d      | d   |d<   | j                  d      | d   |d<   | j                  d      | d   |d<   |S )a5  
    Merge settings in hierarchical order: website > content_type > global
    
    Args:
        website_settings (dict): Website-specific settings
        content_type_settings (dict): Content type settings
        global_settings (dict): Global settings
    
    Returns:
        dict: Merged settings
    FN)check_http_code
http_codescheck_stop_words
stop_wordshomepage_redirectcheck_stop_words_on_pageuse_seleniumuse_playwright	http_codeTr]   ,r^   stopwordr_   r`   redirectra   rb   rc   rd   	stopwords)r   1Ttrue)r   0FfalseNr2   None)r   join)website_settingscontent_type_settingsglobal_settingsmergedws_vals        r-   merge_settingsru      s    !!"$)	F {+(,F$%#&88OK,H#IF< z*)-F%&#&88OJ,G#HF< z**.F&'  $$%67C(=>O(PF$% $$\2#8#FF<  $$%78D)>?Q)RF%& $$\2#8#FF<  $$%89E*?@S*TF&' $$%?@L1FGa1bF-. $$^4@%:>%JF>" $$%56B'<=M'NF#$  12>(89J(KF$%-#3L#AF<  23?)9:L)MF%&,#3K#@F<  "%%&9:++*.F&'AA :;G1AB\1]F-./;%5n%EF>" 01='78H'IF#$M    c                 f   t        |       }|j                  j                  d      }t        |      dkD  r*d}|D ]  }|dk7  s	||dz   z  } |j	                  d      }|S |j                  j                  d      r|j                  j	                  d      }|S |j                  j	                         }|S )z
    Parse URL and extract display link
    
    Args:
        url (str): URL to parse
    
    Returns:
        str: Display link
    .r   r2   wwwzwww.)r   netlocrN   rO   r   
startswith)urlrZ   display_linksdisplaysalinkdisplay_links         r-   	parse_urlr      s     	AHHNN3'M
=A" 	(E~ECK'	(  ~~c*  88u%88>>&1L  88>>+Lrv   )   )__doc__r7   timer   r4   r   urllib.parser   
exceptionsr   r   r.   r@   r[   ru   r    rv   r-   <module>r      s@    
     ! 5(MT.`>"@Obrv   