
    ݫGi_                        d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ 	 ddlmZmZ d Zd Zd Zd Zd Zd Zd Zd Z d Z!ddZ"d Z#y# e$ r  G d d      Z G d d      ZY >w xY w)z3
Content fetching methods for Link Checker package
    N)urlparse)	webdriver)Service)sync_playwright)requests   )ContentFetchErrorSocialMediaError)	get_proxy)ForeStylec                       e Zd ZdZdZdZy)r    N)__name__
__module____qualname__YELLOWGREENRED     @/var/www/html/utilities/link_checker_package/content_fetchers.pyr   r      s    r   r   c                       e Zd ZdZy)r   r   N)r   r   r   	RESET_ALLr   r   r   r   r      s    	r   r   c                    	 t        j                         }|j                  d       |j                  d       |j                  d       |j                  d       |r|dk7  r|j                  d|z          |j                  d| z          |j                  d       |j                  d	d
g       |j                  dd       |j                  d       t	        |d   d         }t        j
                  ||      }|j                  d       |j                  d       |j                  |       t        j                  d       |j                  }|j                  }	|j                  }
|j                          |rd|v sd|v sd|v rd|v rt        t         j"                  dz   t$        j&                  z          t)        |      }|r@t+        |      dkD  r2t        t         j,                  dz   t$        j&                  z          d||
gS t        t         j.                  dz   t$        j&                  z          g dS g dS d|v sd|v sd|v rg d S d!|v rg d S d"|v rg d#S t+        |	      dk  rg d$S t1        |
      }|j2                  d%k(  st+        |j2                        d&k  rd'|	|
gS d|	|
gS # t4        $ r}t7        d(|       d)}~ww xY w)*a>  
    Fetch content using Selenium
    
    Args:
        useragent (str): User agent string
        url (str): URL to fetch
        proxy (str): Proxy string
        cloudflare (bool): Whether to handle Cloudflare
        config (dict): Configuration
    
    Returns:
        list: [status, content, current_url]
    z--no-sandboxz--disable-gpuheadlesszwindow-size=1280,800unknownz--proxy=http://z--user-agent=z:--no-first-run --no-service-autorun --password-store=basicexcludeSwitcheszenable-automationuseAutomationExtensionFz---disable-blink-features=AutomationControlledpathschromedriver_path)serviceoptionszEObject.defineProperty(navigator, 'webdriver', {get: () => undefined})<      Access denied
CloudflareJust a moment)Just a moment... detected, trying Aparser2   &Aparser successfully extracted content200Aparser failed, returning 403403r   r   z	not foundErreurError)404r   r   r2   429)r3   r   r   )600r   r   /   300zSelenium content fetch failed: N)r   ChromeOptionsadd_argumentadd_experimental_optionr   Chromeexecute_scriptset_page_load_timeoutgettimesleeptitlepage_sourcecurrent_urlquitprintr   r   r   r   AparserLinkExtractorlenr   r   r   path	Exceptionr	   )	useragenturlproxy
cloudflareconfigr#   sdriverrA   contentrC   aparser_contentoes                 r   selenium_contentrU      s   ;G))+^,_-Z(34Ui'  !7%!?@_y89YZ''(9<O;PQ''(@%HLMF7O$789!!!W=ef$$R(

3

2$$((%'<5+@OW\D\"e+$++(SSV[VeVeef&:3&?O&3+?"+Ddjj+SSV[VeVeef %DDdhh)HH5??Z[..**%8u#458H""E>""E>""w<"""[!66S=CK!O7K00w,, G"A! EFFGsH   G=J#  0J# 1J# 5J# J# J# J# '6J# J# #	J?,J::J?c                 r   	 dd|z   i}t        |||      }|d   }	|d   }
|d   }|d   }t               5 }|dk(  s|s|j                  j                         }n|j                  j                  |      }|	|
|j	                  | d
d      }n0|j	                  | |d
t        |
      t        |	      ddgdd|i      }|j                         }|j                  d       	 |j                  |d       |j                         }|j                  }|j                         }|j                          |j                          |rd|v sd|v sd|v sd|v rd|v rt!        t"        j$                  dz   t&        j(                  z          t+        |      }|rIt-        |      dkD  r;t!        t"        j.                  d z   t&        j(                  z          d!||gcd	d	d	       S t!        t"        j0                  d"z   t&        j(                  z          g dcd	d	d	       S g dcd	d	d	       S d#|v rd!d|gcd	d	d	       S t3        |      }|j4                  d$k(  st-        |j4                        d%k  rd&||gcd	d	d	       S t-        |      d'k  rt        | ||||      cd	d	d	       S d!||gcd	d	d	       S # t        $ r}dt        |      v r2|j                          |j                          ddgcY d	}~cd	d	d	       S dt        |      v rg dcY d	}~cd	d	d	       S dt        |      v r=|j                          |j                          t        | ||||      cY d	}~cd	d	d	       S |j                          |j                          g dcY d	}~cd	d	d	       S d	}~ww xY w# 1 sw Y   y	xY w# t        $ r}t7        d(|       d	}~ww xY w))a~  
    Fetch content using Playwright
    
    Args:
        useragent (str): User agent string
        url (str): URL to fetch
        cur: Database cursor
        con: Database connection
        proxy (str): Proxy string
        cloudflare (bool): Whether to handle Cloudflare
        config (dict): Configuration
    
    Returns:
        list: [status, content, current_url]
    serverhttp://latitude	longitudetimezonelocaler   )rL   Nen-UST)
user_agentr\   ignore_https_errors)rZ   rY   geolocationAccept-Language)r^   timezone_idr\   r`   permissionsr_   extra_http_headersi`  load)
wait_untilSSL_ERROR_UNKNOWNr   NS_ERROR_PROXY_FORBIDDENr.   Timeout)r,   r   r   r&   r'   r/   r(   r)   r*   r+   r,   r-   z
/search?q=r5   r6   r7      z!Playwright content fetch failed: )get_locationr   firefoxlaunchnew_contextfloatnew_pageset_default_timeoutgotorI   strcloserU   rA   rK   rQ   rE   r   r   r   r   rF   rG   r   r   r   rH   r	   )rJ   rK   curconrL   rM   rN   proxy_to_uselocationlatlngr[   r\   pbrowsercontextpagerT   rA   rC   rQ   rR   rS   s                          r   playwright_contentr   i   s    `Ii%'

  S%0z"{#J'(# Q	1!iu))**,))***>{ck!--("(, .  "--( (".3CjeCj Q!.(,)6( . 
 ##%D$$U++		#&	1$ JJLE((KllnGJJLMMO"e+|u/DQVZimrZr&%/dkk,WWZ_ZiZiij*>s*C*s?/Cb/H!$**/W"WZ_ZiZi"ij$)?K#HAQ	1 Q	1D "$((-L"Lu"^_#2GQ	1 Q	1J  /KQ	1 Q	1N {*r;/QQ	1 Q	1T %Avv}AFFaw4YQ	1 Q	1\ 7|c!'	3z6R_Q	1 Q	1b 7K0cQ	1 Q	1D  +&#a&0JJLMMO8OMQ	1 Q	1N 03q69**QQ	1 Q	1R #a&(JJLMMO+IsE:vVVYQ	1 Q	1\ JJLMMO**aQ	1 Q	1D+EQ	1 Q	1f  I"CA3 GHHIs   2N B)NJ#1CN	N 0N	N N	N N%	N /6N%	N /N	N N	N #	N,0NNN!	N +N;N<N 	N 
;NNN
	N #N7N8N<	N NNNN N 	N6#N11N6c                    	 t        |      }dd|z   i}|d   }d|z   dz   }d}t        |d      5 }	|	j                  |       ddd       t               5 }
|
j                  j                  d	|
      }|j                  | |dd      }|j                  j                  d	d	d	       |j                         }|j                  d       	 |j                  |       t        j                  d       |j#                         }|j$                  }|j'                         }|j!                          |j!                          |cddd       S # 1 sw Y   xY w# t        $ rD}dt        |      v r-|j!                          |j!                          Y d}~ddd       yY d}~d}~ww xY w# 1 sw Y   yxY w# t        $ r}t)        d|       d}~ww xY w)z
    Process Instagram URLs
    
    Args:
        useragent (str): User agent string
        url (str): URL to fetch
        insta_user (dict): Instagram user data
        config (dict): Configuration
    
    Returns:
        str: Content string
    rW   rX   cookies{"cookies": }zinsta_cookies.jsonwNTr   rL   r]   Europe/Tallinnr^   storage_stater\   rb   screenshots	snapshotssources 
   ri   r   zInstagram processing failed: )r   openwriter   chromiumrm   rn   tracingstartrp   rq   rr   r?   r@   rI   rs   rt   rA   rK   rQ   r
   )rJ   rK   
insta_userrN   rL   rw   r   new_cookiescookies_filefhr{   r|   r}   r~   rT   rA   rC   rQ   s                     r   process_instagramr      s   +D&!i%'
 Y'$w.4+,$ 	"HH[!	"  	!jj''\'JG))$*,	 * G OO!!ddD!Q##%D$$V,		#

2 JJLE((KllnGJJLMMO7	 		" 	"$  A&JJLMMO'	 	  '	 	:  D!>qcBCCDs   -F, EF, A2F &E,AF 9	F, EF, 	F-FF 
F, F FF  F)%F, )F, ,	G5GGc                    	 t        |      }dd|z   i}|d   }d|z   dz   }d}	t        |	d      5 }
|
j                  |       ddd       t               5 }|j                  j                  d	|
      }|j                  | |	dd      }|j                  j                  d	d	d	       |j                         }|j                  d       	 |j                  |       t        j                  d       |j!                         }|j"                  }|j%                         }d|v sd|vr
	 ddd       y|j                          |j                          |cddd       S # 1 sw Y   xY w# t        $ r2}|j                          |j                          Y d}~ddd       yd}~ww xY w# 1 sw Y   yxY w# t        $ r}t'        d|       d}~ww xY w)a  
    Process Facebook URLs
    
    Args:
        useragent (str): User agent string
        cur: Database cursor
        url (str): URL to fetch
        fb_user (dict): Facebook user data
        config (dict): Configuration
    
    Returns:
        str: Content string
    rW   rX   r   r   r   zfacebook_cookies.jsonr   NTr   r]   r   r   r   r   r%   r   zLog into Facebookz
logout.phpzFacebook processing failed: r   r   r   r   r   rm   rn   r   r   rp   rq   rr   r?   r@   rI   rt   rA   rK   rQ   r
   )rJ   ru   rK   fb_userrN   rL   rw   r   r   r   r   r{   r|   r}   r~   rT   rA   rC   rQ   s                      r   process_facebookr     s   .C&!i%'
 )$$w.4.,$ 	"HH[!	"  	!jj''\'JG))$*,	 * G OO!!ddD!Q##%D$$V,		#

2 JJLE((KllnG"g-W1L3	 	6 JJLMMO=	 		" 	"$  

%	 		 	@  C!=aSABBCs   -F, EF, A2F &E",5F !F, *!F 	F, EF, "	F+ FF F, FF  F)%F, )F, ,	G5GGc                 r   	 t        |      }dd|z   i}|d   }d|z   dz   }d}t        |d      5 }	|	j                  |       ddd       t               5 }
|
j                  j                  d	|
      }|j                  | |dd      }|j                  j                  d	d	d	       |j                         }|j                  d       	 |j                  |       t        j                  d       |j!                         }|j"                  }|j%                         }|j                          |j                          |cddd       S # 1 sw Y   xY w# t        $ r2}|j                          |j                          Y d}~ddd       yd}~ww xY w# 1 sw Y   yxY w# t        $ r}t'        d|       d}~ww xY w)z
    Process TikTok URLs
    
    Args:
        useragent (str): User agent string
        url (str): URL to fetch
        tiktok_user (dict): TikTok user data
        config (dict): Configuration
    
    Returns:
        str: Content string
    rW   rX   r   r   r   ztiktok_cookies.jsonr   NTr   r]   r   r   r   r   r   r   zTikTok processing failed: r   )rJ   rK   tiktok_userrN   rL   rw   r   r   r   r   r{   r|   r}   r~   rT   rA   rC   rQ   s                     r   process_tiktokr   S     *A&!i%'
 i($w.4,,$ 	"HH[!	"  	!jj''\'JG))$*,	 * G OO!!ddD!Q##%D$$V,		#

2 JJLE((KllnGJJLMMO5	 		" 	"$  

%	 		 	8  A!;A3?@@A~   -F EF A2F&E,AF9	F EF 	F F9F=F FFFF F 	F6#F11F6c                 r   	 t        |      }dd|z   i}|d   }d|z   dz   }d}t        |d      5 }	|	j                  |       ddd       t               5 }
|
j                  j                  d	|
      }|j                  | |dd      }|j                  j                  d	d	d	       |j                         }|j                  d       	 |j                  |       t        j                  d       |j!                         }|j"                  }|j%                         }|j                          |j                          |cddd       S # 1 sw Y   xY w# t        $ r2}|j                          |j                          Y d}~ddd       yd}~ww xY w# 1 sw Y   yxY w# t        $ r}t'        d|       d}~ww xY w)z
    Process Reddit URLs
    
    Args:
        useragent (str): User agent string
        url (str): URL to fetch
        reddit_user (dict): Reddit user data
        config (dict): Configuration
    
    Returns:
        str: Content string
    rW   rX   r   r   r   zreddit_cookies.jsonr   NTr   r]   r   r   r   r   r   r   zReddit processing failed: r   )rJ   rK   reddit_userrN   rL   rw   r   r   r   r   r{   r|   r}   r~   rT   rA   rC   rQ   s                     r   process_redditr     r   r   c           	      B   	 d}| j                  ||g       | j                  rY| j                         }|d   }|d   j                  d      }|d   |d   |d   |d   |d	   |d
   |d   j	                         |d}|S dddddddd}|S # t
        $ r ddddddddcY S w xY w)z
    Get location data for proxy
    
    Args:
        cur: Database cursor
        con: Database connection
        proxy (str): Proxy string
    
    Returns:
        dict: Location data
    z.SELECT * FROM proxy_locations WHERE proxy = %sr   r\   ,cityregioncountryry   rz   r[   )r   r   r   rY   rZ   r[   r\   rL   N)r   r   r   rY   rZ   r[   r\   )executerowcountfetchallsplitstriprI   )ru   rv   rL   sql
proxy_dataresponselocalocation_datas           r   rk   rk     s    %
>C%!<<J!!}HH%++C0D ("8,#I.$UO%e_$Z0q'--/	M(   ! M  	

 	
	
s   A9B <B BBc                     	 t        d      j                         j                         } t        j                  |       }|j                         S # t        $ r}t        d|        	 t        d      j                         j                         } t        j                  |       }|j                         cY d}~S # t        $ r}t        d|        Y d}~Y d}~yd}~ww xY wd}~ww xY w)zd
    Fetch a random proxy from proxy file
    
    Returns:
        str: Proxy string (ip:port)
    z/root/flask/proxies/proxy.txtz Error fetching proxy from file: z"/opt/aparser/files/proxy/proxy.txtNz)Error fetching proxy from fallback file: z127.0.0.1:8080)r   read
splitlinesrandomchoicer   rI   rE   )linesrL   rT   e2s       r   fetch_proxy_from_filer     s    $45::<GGIe${{} 	$045	$=>CCEPPREMM%(E;;=  	$=bTBC##	$	$sC   AA 	C$C&AB71C$7	C CCCCC$c                      d} 	 t        j                  | d      }|j                  d      j                         }|S # t        $ r}t        d|        Y d}~yd}~ww xY w)z]
    Get user agent from Node.js script
    
    Returns:
        str: User agent string
    znode /root/user_agents.jsT)shellzutf-8zError getting user agent: NzoMozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36)
subprocesscheck_outputdecoder   BaseExceptionrE   )commandrJ   rT   s      r   get_user_agent_from_scriptr     sh     *GA++G4@	$$W-335	 A*1#./ AAs   7< 	AAAc                 
   t        t        j                  d|  z   t        j                  z          |s
t               }|s
t               }d| }||d}t        d|        t        d|dd  d       |d	d
ddddddddd}t        j                         }t        |      D ]F  }	 t        j                         }	t        d|dz    d| d       |j                  | ||ddddd      }
t        j                         |	z
  }t        d|dd       t        d |
j                          t        d!|
j                          d}t        |       }t        t        |
j                              }t!        |j"                        d"kD  }|j"                  dk(  xs t!        |j"                        d"k  }|rT|rRd}|j$                  |j$                  k7  rt        d#|  d$|
j                          nt        d%|  d$|
j                          |
j                  d&k(  rT||dz
  k  rLt        d'|dz    d| d(       t        j&                  d"       	 d)t)               v s	 |j+                          t!        |
j,                        }|
j.                  j                  d*      }t        d+| d,       |r0t1        |      }t        d-| d,       ||k  rt        d.| d/|        |
j                  d0v rEd1|
j2                  t        |
j                        |gd)t)               v r	 |j+                          c S c S |
j                  d2v r\|rd3nt        |
j                        |
j2                  t        |
j                        |gd)t)               v r	 |j+                          c S c S t        |
j                        |
j2                  t        |
j                        |gd)t)               v r	 |j+                          c S c S  d6d7| dgS #  Y UxY w#  Y c S xY w#  Y c S xY w#  Y c S xY w# t        j4                  $ rq}t        d4|dz    d5|        ||dz
  k(  r6d6d7| dgcY d}~d)t)               v r	 |j+                          c S #  Y c S xY wc S t        j&                  d"       Y d}~nd}~wt6        $ r}t        d8|dz    d5|        ||dz
  k(  rJd9dl}|j;                          d6d7| dgcY d}~d)t)               v r	 |j+                          c S #  Y c S xY wc S t        j&                  d"       Y d}~nd}~ww xY wd)t)               v s	 |j+                          #  Y xY w# d)t)               v r	 |j+                          w #  Y w xY ww xY w):au  
    Fetch content using curl_cffi with proxy support and retry logic
    
    Args:
        url (str): URL to fetch
        useragent (str): User agent string (optional)
        proxy (str): Proxy string (optional)
        max_retries (int): Maximum number of retries for 500 errors
    
    Returns:
        list: [status_code, content, final_url, redirect_detected]
    zUsing curl_cffi for rX   )httphttpszUsing proxy: zUsing User-Agent: Nr*   z...z`text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8zen-US,en;q=0.9zgzip, deflate, brzhttps://www.google.com/z
keep-alive1documentnavigatenonez	max-age=0)z
User-AgentAcceptra   zAccept-EncodingReferer
ConnectionzUpgrade-Insecure-RequestszSec-Fetch-DestzSec-Fetch-ModezSec-Fetch-SitezCache-ControlzFetching URL (attempt r   r5   z)...F)r$   x   T	chrome110)proxiesheadersverifytimeoutallow_redirectsstreamimpersonatezRequest completed in z.2fz secondszStatus: zFinal URL: r6   z%Cross-domain home redirect detected: z -> zHome redirect detected: i  z*Got 500 status code, retrying... (attempt )sessionzContent-LengthzContent length: z byteszExpected length: z(WARNING: Content may be incomplete! Got z bytes, expected )rj         r,   )i,  i-  i.  i/  i3  i4  r7   zRequest error (attempt z): 500r   zError (attempt r   )rE   r   r   r   r   r   r   curl_requestsSessionranger?   r>   status_coderK   r   rs   rG   rH   netlocr@   localsrt   rQ   r   inttextRequestsErrorrI   	traceback	print_exc)rK   rJ   rL   max_retries	proxy_urlr   r   r   attempt
start_timer   elapsed_timeredirect_detectedoriginal_parsedfinal_parsedoriginal_has_pathfinal_is_homepagecontent_lengthcontent_length_headerexpected_lengthrT   r   s                         r   curl_cffi_contentr     s    
$++.se4
4u
FG %'.0	%!I 95G	M%
!"	y"~.c
23  t+.,"%($$ $G ##%G% RQ	J*7Q;-qTJK{{! $' # 	H  99;3L),s);8DEHX11234K~./ !&&smO#C$56L !$O$8$8 9A =!-!2!2c!9!WSARAR=SVW=W %6$(!"))\-@-@@A#d8<<.YZ4SEhll^LM ##s*wq/HB7Q;-qQ\P]]^_`

1H FH$MMOG !!1!12N$,$4$4$8$89I$J!$^$4F;<$"%&;"<)/):&AB!O3D^DTTefuevwx ##6x}}c(,,.?ARS* FH$MMO %) %%)GG!2H<P<P8QS[S`S`befnfrfrbs  vG  H& FH$MMO %# H0018==#hllBSUfg" FH$MMO %]Rh 2sE""' ** 	+GaK=A3?@+/)r3.. FH$MMO	 % JJqMM 	OGaK=A378+/) ##%r3.. FH$MMO	 % JJqMM	 FH$MMO	 FH$MMO	 %s   %FPO0#B!PO8'AP;P5PP
0O58O>P
PT#&!RT#UQ,,Q28RUT#5TT#U%S88S>TUT##U5UUU6U.-U6.U2	0U6c                 D   t        t        j                  dz   | z   t        j                  z          d}dddd| ddd	}d
dd}	 t        j                  |t        j                  |      |      }|j                         }|j                  d      dk(  r5t        t        j                  dz   t        j                  z          |d   d   S t        t        j                  dz   t        j                  z          t        |       y# t        $ r}t        |       Y d}~yd}~ww xY w)z
    Extract content using Aparser when Cloudflare protection is detected
    
    Args:
        url (str): URL to extract content from
    
    Returns:
        str: Extracted content or empty string if failed
    zUSING APARSER zhttp://pma6.remov.ee:9096/API
oneRequestz	Net::HTTP
secondpackdefault)parserpresetconfigPresetquerytWCj3yvB4v4US2wjfrSC)actiondatapasswordzapplication/textzUTF-8)zcontent-typezAccept-Charset)r   r   successr   zAParser response receivedr   resultStringzAParser failed to fetch contentr   N)rE   r   r   r   r   r   postjsondumpsr>   r   r   rI   )rK   
parser_urlpayloadr   resr   rT   s          r   rF   rF     s     
$++(
(3
.
@A0J!"%	
 +	G  2WMGmmJTZZ-@'R88:<<	"a'$**::U__LMF#N33$((>>PQ(O as   BD 8D 	D
DD)NN   )$__doc__r?   r   r   r  r   urllib.parser   seleniumr   !selenium.webdriver.chrome.servicer   playwright.sync_apir   	curl_cffir   
exceptionsr	   r
   utilsr   coloramar   r   ImportErrorrU   r   r   r   r   r   rk   r   r   r   rF   r   r   r   <module>r     s         !  5 / / ; 	$IGVpId8Dt<C|7Ar7Ar1
f$,A @#D&W    s   A/ /B
B