"""
Core LinkChecker class for the package
"""

import time
import re
from bs4 import BeautifulSoup
from random import randint
from colorama import Fore, Back, Style
import requests
from urllib.parse import urlparse
from .config import get_config
from .database import (
    get_connection, get_website_settings, get_content_type_settings,
    get_global_deletion_settings, get_project, get_user_agent,
    get_cookies, get_deleted_keywords, is_cloudflare_website, get_project_title_score
)
from .utils import (
    get_proxy, get_content, test_content, merge_settings, parse_url
)
from .content_fetchers import (
    AparserLinkExtractor, selenium_content, playwright_content, process_instagram,
    process_facebook, process_tiktok, process_reddit, curl_cffi_content
)
from .exceptions import LinkCheckerError, ConfigurationError
from rapidfuzz import fuzz

class LinkChecker:
    """
    Main LinkChecker class that orchestrates link checking functionality
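
    Typical usage (a minimal sketch; the server_type value is only an example):

        with LinkChecker(server_type="autocheck") as checker:
            result = checker.check_link("https://example.com/some-page", project_id=123)
            print(result["status"], result["analysis"]["reason"])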
    """
    
    def __init__(self, server_type=None, config_file=None):
        """
        Initialize LinkChecker
        
        Args:
            server_type (str): Type of server (autocheck, linkverification, etc.)
            config_file (str): Path to custom config file
        """
        self.config = get_config(server_type, config_file)
        self.con = None
        self.cur = None
    
    def __enter__(self):
        """Context manager entry"""
        self.connect()
        return self
    
    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit"""
        self.disconnect()
    
    def connect(self):
        """Connect to database"""
        self.con, self.cur = get_connection(self.config['database'])
    
    def disconnect(self):
        """Disconnect from database"""
        if self.cur:
            self.cur.close()
        if self.con:
            self.con.close()
    
    def _ensure_database_connection(self):
        """
        Ensure database connection is active, reconnect if needed
        
        Returns:
            bool: True if connected, False if connection failed
        """
        try:
            # Check if cursor exists and connection is alive
            if not self.cur:
                print("Database cursor not found, attempting to connect...")
                self.connect()
                return True
            
            # Test the connection with a simple query
            self.cur.execute("SELECT 1")
            return True
            
        except Exception as e:
            print(f"Database connection lost, attempting to reconnect: {e}")
            try:
                # Close existing connection if any
                if self.con:
                    try:
                        self.con.close()
                    except Exception:
                        pass
                
                # Reconnect
                self.connect()
                print("Database reconnected successfully")
                return True
                
            except Exception as reconnect_error:
                print(f"Failed to reconnect to database: {reconnect_error}")
                return False
    
    def check_link(self, url, project_id=None, **kwargs):
        """
        Main method to check a link
        
        Args:
            url (str): URL to check
            project_id (int): Project ID (optional)
            **kwargs: Additional parameters
        
        Returns:
            dict: Check result with status, content, and analysis
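                The dict contains the keys: url, display_link, status, content,
                test_result, analysis, settings, proxy and useragent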
        """
        try:
            # Parse URL and get display link
            display_link = parse_url(url)
            
            # Ensure database connection is active
            if not self._ensure_database_connection():
                print("Database connection failed, using fallback mode")
                return self._check_link_without_database(url, project_id, **kwargs)
            
            # Get website settings
            website_data = get_website_settings(self.cur, display_link)
            website_settings = website_data['website_settings']
            hosting_email = website_data['hosting_email']
            cloudflare = is_cloudflare_website(hosting_email)
            
            # Get content type settings if project_id provided
            content_type_settings = {}
            if project_id:
                if not self._ensure_database_connection():
                    print("Database connection lost during content type settings fetch, using fallback mode")
                    return self._check_link_without_database(url, project_id, **kwargs)
                content_type_settings = get_content_type_settings(self.cur, project_id)
            
            # Get global settings
            if not self._ensure_database_connection():
                print("Database connection lost during global settings fetch, using fallback mode")
                return self._check_link_without_database(url, project_id, **kwargs)
            global_settings = get_global_deletion_settings(self.cur)
            
            # Extract settings - check website first, then global
            check_http_code = website_settings.get('check_http_code')
            http_codes = website_settings.get('http_codes')
            check_stop_words = website_settings.get('check_stop_words')
            stop_words = website_settings.get('stopwords')  # Note: website uses 'stopwords', global uses 'stopword'
            homepage_redirect = website_settings.get('homepage_redirect')
            check_stop_words_on_page = website_settings.get('check_stop_words_on_page')
            use_selenium = website_settings.get('use_selenium')
            use_playwright = website_settings.get('use_playwright')
            
            # Check if ANY website setting is configured
            website_has_settings = any([
                check_http_code not in (None, 0),
                http_codes not in (None, [], {}),
                check_stop_words not in (None, 0),
                stop_words not in (None, [], {}),
                homepage_redirect not in (None, 0)
            ])
            print(f"Website settings for {display_link}: {website_settings}, has_settings={website_has_settings}")
            
            # If website has ANY settings, use ONLY website settings (no global fallback)
            if website_has_settings:
                print("Website has settings configured, using ONLY website settings")
                # Use website settings as-is, don't fall back to global;
                # unset flags default to False, unset code/word lists stay None
                if check_http_code is None:
                    check_http_code = False
                if check_stop_words is None:
                    check_stop_words = False
                if homepage_redirect is None:
                    homepage_redirect = False
            else:
                print("No website settings found, using global settings")
                # Only use global settings if NO website settings exist
                if global_settings.get('http_code', ''):
                    http_codes = ','.join(global_settings['http_code'])
                if global_settings.get('stopword', ''):
                    stop_words = ','.join(global_settings['stopword'])
                if global_settings.get('redirect', 0):
                    homepage_redirect = True

            # Set defaults for optional settings (these don't affect deletion criteria)
            if check_stop_words_on_page is None:
                check_stop_words_on_page = False  # Default to False if not specified
            if use_selenium is None:
                use_selenium = False  # Default to False if not specified
            if use_playwright is None:
                use_playwright = False  # Default to False if not specified
            
            print(f"Settings: selenium={use_selenium}, playwright={use_playwright}, http_codes={http_codes}, homepage_redirect={homepage_redirect}")
            
            # Parse codes and stop words
            codes_to_check = []
            stop_words_to_check = []
            if http_codes:
                codes_to_check = [c.strip() for c in str(http_codes).split(',') if c.strip()]
            if stop_words:
                stop_words_to_check = [w.strip() for w in str(stop_words).split(',') if w.strip()]
            
            # Get proxy and user agent
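            # get_proxy is expected to return a bare "host:port" string; the scheme
            # is prepended later where the scraper command is built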
            try:
                proxy = get_proxy(self.config)
            except Exception as e:
                print(f"Failed to get proxy: {e}")
                proxy = "127.0.0.1:8080"  # Fallback proxy
            
            try:
                if not self._ensure_database_connection():
                    print("Database connection lost during user agent fetch, using fallback user agent")
                    useragent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
                else:
                    useragent = get_user_agent(self.cur)
                    if not useragent:
                        useragent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            except Exception as e:
                print(f"Failed to get user agent: {e}")
                useragent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            
            # Fetch content based on platform and settings
            print(f"Fetching content for {display_link} using user agent: {useragent} : proxy={proxy}, cloudflare={cloudflare} : selenium={use_selenium}, playwright={use_playwright}")
            content, test_result = self._fetch_content(
                url, display_link, useragent, proxy, cloudflare,
                use_selenium, use_playwright
            )
            
            print(f"Fetched content for {display_link} with status: {test_result}")
            
            # Analyze content
            analysis_result = self._analyze_content(
                content, test_result, check_http_code, codes_to_check,
                check_stop_words, stop_words_to_check, homepage_redirect,
                check_stop_words_on_page, display_link, global_settings, cloudflare, url, website_has_settings
            )
            print(f"Analysis result for {display_link}: {analysis_result}")
            
            return {
                'url': url,
                'display_link': display_link,
                'status': analysis_result['status'],
                'content': content,
                'test_result': test_result,
                'analysis': analysis_result,
                'settings': {
                    'check_http_code': check_http_code,
                    'http_codes': http_codes,
                    'check_stop_words': check_stop_words,
                    'stop_words': stop_words,
                    'homepage_redirect': homepage_redirect,
                    'check_stop_words_on_page': check_stop_words_on_page,
                    'use_selenium': use_selenium,
                    'use_playwright': use_playwright
                },
                'proxy': proxy,
                'useragent': useragent
            }
            
        except Exception as e:
            import traceback
            traceback.print_exc()
            raise LinkCheckerError(f"Link check failed: {e}")
    
    def _check_link_without_database(self, url, project_id=None, **kwargs):
        """
        Check a link without database connection (fallback method)
        
        Args:
            url (str): URL to check
            project_id (int): Project ID (optional)
            **kwargs: Additional parameters
        
        Returns:
            dict: Basic check result
        """
        try:
            print(f"Checking link without database: {url}")
            
            # Parse URL and get display link
            display_link = parse_url(url)
            
            # Use default settings - no website settings, use global only
            website_settings = {}
            content_type_settings = {}
            global_settings = {
                'http_code': ['404', '410', '451'],
                'stopword': [],
                'redirect': [],
                'skip_code': []
            }
            
            # Extract settings - check website first, then global
            check_http_code = website_settings.get('check_http_code')
            http_codes = website_settings.get('http_codes')
            check_stop_words = website_settings.get('check_stop_words')
            stop_words = website_settings.get('stopwords')
            homepage_redirect = website_settings.get('homepage_redirect')
            check_stop_words_on_page = website_settings.get('check_stop_words_on_page')
            use_selenium = website_settings.get('use_selenium')
            use_playwright = website_settings.get('use_playwright')
            
            # Check if ANY website setting is configured
            website_has_settings = any([
                check_http_code is not None,
                http_codes is not None,
                check_stop_words is not None,
                stop_words is not None,
                homepage_redirect is not None
            ])
            
            # If website has ANY settings, use ONLY website settings (no global fallback)
            if website_has_settings:
                print("Website has settings configured, using ONLY website settings")
                # Use website settings as-is, don't fall back to global;
                # unset flags default to False, unset code/word lists stay None
                if check_http_code is None:
                    check_http_code = False
                if check_stop_words is None:
                    check_stop_words = False
                if homepage_redirect is None:
                    homepage_redirect = False
            else:
                print(f"No website settings found, using global settings")
                # Only use global settings if NO website settings exist
                if check_http_code is None:
                    check_http_code = global_settings.get('http_code') is not None
                if http_codes is None and global_settings.get('http_code'):
                    http_codes = ','.join(global_settings['http_code'])
                if check_stop_words is None:
                    check_stop_words = global_settings.get('stopword') is not None
                if stop_words is None and global_settings.get('stopword'):
                    stop_words = ','.join(global_settings['stopword'])
                if homepage_redirect is None:
                    homepage_redirect = bool(global_settings.get('redirect'))
            
            # Set defaults for optional settings (these don't affect deletion criteria)
            if check_stop_words_on_page is None:
                check_stop_words_on_page = False
            if use_selenium is None:
                use_selenium = False
            if use_playwright is None:
                use_playwright = False
            
            print(f"Settings: selenium={use_selenium}, playwright={use_playwright}, http_codes={http_codes}, homepage_redirect={homepage_redirect}")
            
            # Parse codes and stop words
            codes_to_check = []
            stop_words_to_check = []
            if http_codes:
                codes_to_check = [c.strip() for c in str(http_codes).split(',') if c.strip()]
            if stop_words:
                stop_words_to_check = [w.strip() for w in str(stop_words).split(',') if w.strip()]
            
            # Get proxy and user agent with fallbacks
            try:
                proxy = get_proxy(self.config)
            except Exception as e:
                print(f"Failed to get proxy: {e}")
                proxy = "127.0.0.1:8080"  # Fallback proxy
            
            useragent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            
            print(f"Fetching content for {display_link} using user agent: {useragent}")
            
            # Fetch content using default scraper
            content, test_result = self._fetch_content_simple(url, display_link, useragent, proxy)
            
            print(f"Fetched content for {display_link} with status: {test_result}")
            
            # Analyze content
            analysis_result = self._analyze_content(
                content, test_result, check_http_code, codes_to_check,
                check_stop_words, stop_words_to_check, homepage_redirect,
                check_stop_words_on_page, display_link, global_settings, False, url, False
            )
            print(f"Analysis result for {display_link}: {analysis_result}")
            
            return {
                'url': url,
                'display_link': display_link,
                'status': analysis_result['status'],
                'content': content,
                'test_result': test_result,
                'analysis': analysis_result,
                'settings': {
                    'check_http_code': check_http_code,
                    'http_codes': http_codes,
                    'check_stop_words': check_stop_words,
                    'stop_words': stop_words,
                    'homepage_redirect': homepage_redirect,
                    'check_stop_words_on_page': check_stop_words_on_page,
                    'use_selenium': use_selenium,
                    'use_playwright': use_playwright
                },
                'proxy': proxy,
                'useragent': useragent
            }
            
        except Exception as e:
            print(f"Error in _check_link_without_database: {e}")
            import traceback
            traceback.print_exc()
            # Return a basic error result (display_link may be unset if parsing failed)
            return {
                'url': url,
                'display_link': locals().get('display_link', url),
                'status': 'error',
                'content': '',
                'test_result': ['500', 'null', ['500']],
                'analysis': {
                    'status': 'error',
                    'reason': f'Database connection failed: {e}',
                    'is_deleted': False,
                    'http_code': '500',
                    'redirect_url': 'null',
                    'all_codes': ['500']
                },
                'settings': {},
                'proxy': '127.0.0.1:8080',
                'useragent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
    
    def _fetch_content_simple(self, url, display_link, useragent, proxy):
        """
        Simple content fetching without database dependencies
        
        Args:
            url (str): URL to fetch
            display_link (str): Display link
            useragent (str): User agent string
            proxy (str): Proxy string
        
        Returns:
            tuple: (content, test_result)
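                test_result has the form [http_code, redirect_url, all_codes],
                matching what _analyze_content expects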
        """
        try:
            print("Using simple scraper with curl_cffi fallback for content fetching")
            # Use scraper first with 30-second timeout
            proxy_to_use = "http://" + proxy
            link_id = str(int(time.time() * 1000)) + '.txt'
            command = f'{self.config["paths"]["scraper_path"]} "{url}" {proxy_to_use} "{useragent}" {link_id}'
            
            start_time = time.time()
            content = get_content(command, link_id, timeout=30)
            elapsed_time = time.time() - start_time
            
            # Check if scraper timed out
            if content == "TIMEOUT":
                print(f"Scraper timed out after 30 seconds, trying curl_cffi")
                try:
                    curl_result = curl_cffi_content(url, useragent, proxy)
                    print(f"curl_cffi completed in {elapsed_time:.2f} seconds after scraper timeout")
                    # Check for home redirect detection
                    if curl_result[3]:  # redirect_detected flag
                        return curl_result[1], ['300', curl_result[2], ['300']]
                    else:
                        return curl_result[1], [curl_result[0], curl_result[2], [curl_result[0]]]
                except Exception as e:
                    print(f"curl_cffi also failed after scraper timeout: {e}")
                    return "", ['500', 'null', ['500']]
            
            test_result = test_content(content)
            print(f"Scraper completed in {elapsed_time:.2f} seconds")
            
            # Check if scraper returned code 206, if so retry with curl_cffi first, then Selenium
            if test_result[0] == '206':
                print(f"Scraper returned code 206, trying curl_cffi first")
                try:
                    curl_result = curl_cffi_content(url, useragent, proxy)
                    if curl_result[0] in ['200', '300']:
                        print(f"curl_cffi succeeded where scraper failed")
                        # Check for home redirect detection
                        if curl_result[3]:  # redirect_detected flag
                            return curl_result[1], ['300', curl_result[2], ['300']]
                        else:
                            return curl_result[1], [curl_result[0], curl_result[2], [curl_result[0]]]
                    else:
                        print(f"curl_cffi also failed, trying Selenium")
                        content_parts = selenium_content(useragent, url, proxy, False, self.config)  # cloudflare=False for simple mode
                        return content_parts[1], [content_parts[0], 'null', ['200']]
                except Exception as e:
                    print(f"curl_cffi failed: {e}, trying Selenium")
                    try:
                        content_parts = selenium_content(useragent, url, proxy, False, self.config)  # cloudflare=False for simple mode
                        return content_parts[1], [content_parts[0], 'null', ['200']]
                    except Exception as e2:
                        print(f"Selenium retry also failed: {e2}, returning original scraper result")
                        return content, test_result
            
            # For 500 errors or other failures, try curl_cffi
            if test_result[0] in ['500', '403', '429', '503']:
                print(f"Scraper returned {test_result[0]}, trying curl_cffi")
                try:
                    curl_result = curl_cffi_content(url, useragent, proxy)
                    if curl_result[0] in ['200', '300']:
                        print(f"curl_cffi succeeded where scraper failed")
                        # Check for home redirect detection
                        if curl_result[3]:  # redirect_detected flag
                            return curl_result[1], ['300', curl_result[2], ['300']]
                        else:
                            return curl_result[1], [curl_result[0], curl_result[2], [curl_result[0]]]
                    else:
                        print(f"curl_cffi also returned {curl_result[0]}, using original scraper result")
                        return content, test_result
                except Exception as e:
                    print(f"curl_cffi fallback failed: {e}, using original scraper result")
                    return content, test_result
            
            return content, test_result
        except Exception as e:
            print(f"Error in simple content fetching: {e}")
            # Try curl_cffi as last resort
            try:
                print("Trying curl_cffi as last resort")
                curl_result = curl_cffi_content(url, useragent, proxy)
                # Check for home redirect detection
                if curl_result[3]:  # redirect_detected flag
                    return curl_result[1], ['300', curl_result[2], ['300']]
                else:
                    return curl_result[1], [curl_result[0], curl_result[2], [curl_result[0]]]
            except Exception as e2:
                print(f"curl_cffi last resort also failed: {e2}")
                return "", ['500', 'null', ['500']]
    
    def _fetch_content(self, url, display_link, useragent, proxy, cloudflare, use_selenium, use_playwright):
        """
        Fetch content from URL using appropriate method
        
        Args:
            url (str): URL to fetch
            display_link (str): Display link
            useragent (str): User agent string
            proxy (str): Proxy string
            cloudflare (bool): Whether to handle Cloudflare
            use_selenium (bool): Whether to use Selenium
            use_playwright (bool): Whether to use Playwright
        
        Returns:
            tuple: (content, test_result)
        """
        print(f"Fetching content for {display_link} with settings: selenium={use_selenium}, playwright={use_playwright}")
        
        # Handle social media platforms
        if 'instagram' in display_link:
            print("Using Instagram handler")
            insta_user = get_cookies(self.cur, 'insta_user')
            if insta_user and int(insta_user[0]['status']) == 1:
                content = process_instagram(useragent, url, insta_user[0], self.config)
                return content, ['200', 'null', ['200']]
            else:
                print("Instagram user not available or disabled, falling through to next handler")
                return "Instagram user not available or disabled, falling through to next handler", ['200', 'null', ['200']]
        
        elif 'facebook' in display_link:
            print("Using Facebook handler")
            fb_user = get_cookies(self.cur, 'facebook_user')
            if fb_user and int(fb_user[0]['status']) == 1:
                content = process_facebook(useragent, self.cur, url, fb_user[0], self.config)
                return content, ['200', 'null', ['200']]
            else:
                print("Facebook user not available or disabled, falling through to next handler")
                return "Facebook user not available or disabled, falling through to next handler", ['200', 'null', ['200']]

        elif 'tiktok' in display_link:
            print("Using TikTok handler")
            tiktok_user = get_cookies(self.cur, 'tiktok_user')
            if tiktok_user and int(tiktok_user[0]['status']) == 1:
                content = process_tiktok(useragent, url, tiktok_user[0], self.config)
                return content, ['200', 'null', ['200']]
            else:
                print("TikTok user not available or disabled, falling through to next handler")
                return "TikTok user not available or disabled, falling through to next handler", ['200', 'null', ['200']]

        elif 'reddit' in url:
            print("Using Reddit handler")
            reddit_user = get_cookies(self.cur, 'reddit_user')
            if reddit_user and int(reddit_user[0]['status']) == 1:
                content = process_reddit(useragent, url, reddit_user[0], self.config)
                return content, ['200', 'null', ['200']]
            else:
                print("Reddit user not available or disabled, falling through to next handler")
                return "Reddit user not available or disabled, falling through to next handler", ['200', 'null', ['200']]


        # Handle special cases
        elif 'kinogoby.zone' in url or 'films' in url:
            print("Using Playwright for kinogoby.zone/films")
            content_parts = playwright_content(useragent, url, self.cur, self.con, proxy, cloudflare, self.config)
            return content_parts[1], [content_parts[0], content_parts[2], ['200']]
        
        elif 'turbobit.net' in url:
            print("Using Selenium for turbobit.net")
            # Use selenium wait content for turbobit
            content_parts = selenium_content(useragent, url, proxy, cloudflare, self.config)
            return content_parts[1], ['200', 'null', ['200']]
        
        # Use configured method
        elif use_playwright:
            print("Using Playwright (configured)")
            content_parts = playwright_content(useragent, url, self.cur, self.con, proxy, cloudflare, self.config)
            return content_parts[1], [content_parts[0], content_parts[2], ['200']]
        
        elif use_selenium:
            print("Using Selenium (configured)")
            content_parts = selenium_content(useragent, url, proxy, cloudflare, self.config)
            return content_parts[1], [content_parts[0], 'null', ['200']]
        
        else:
            print("Using default scraper with curl_cffi fallback")
            # Use scraper first with 30-second timeout
            proxy_to_use = "http://" + proxy
            link_id = str(int(time.time() * 1000)) + '.txt'
            command = f'{self.config["paths"]["scraper_path"]} "{url}" {proxy_to_use} "{useragent}" {link_id}'
            
            start_time = time.time()
            content = get_content(command, link_id, timeout=30)
            elapsed_time = time.time() - start_time
            
            # Check if scraper timed out
            if content == "TIMEOUT":
                print(f"Scraper timed out after 30 seconds, trying curl_cffi")
                try:
                    curl_result = curl_cffi_content(url, useragent, proxy)
                    print(f"curl_cffi completed in {elapsed_time:.2f} seconds after scraper timeout")
                    # Check for home redirect detection
                    if curl_result[3]:  # redirect_detected flag
                        return curl_result[1], ['300', curl_result[2], ['300']]
                    else:
                        return curl_result[1], [curl_result[0], curl_result[2], [curl_result[0]]]
                except Exception as e:
                    print(f"curl_cffi also failed after scraper timeout: {e}")
                    return "", ['500', 'null', ['500']]
            
            test_result = test_content(content)
            print(f"Scraper completed in {elapsed_time:.2f} seconds")
            
            # Check if scraper returned code 206, if so retry with Selenium
            if test_result[0] == '206':
                print(f"Scraper returned code 206, retrying with Selenium")
                try:
                    content_parts = selenium_content(useragent, url, proxy, cloudflare, self.config)
                    return content_parts[1], [content_parts[0], 'null', ['200']]
                except Exception as e:
                    print(f"Selenium retry failed: {e}, trying curl_cffi")
                    # Try curl_cffi as fallback
                    curl_result = curl_cffi_content(url, useragent, proxy)
                    # Check for home redirect detection
                    if curl_result[3]:  # redirect_detected flag
                        return curl_result[1], ['300', curl_result[2], ['300']]
                    else:
                        return curl_result[1], [curl_result[0], curl_result[2], [curl_result[0]]]
            
            # Check if scraper returned 200 or 300 status codes - use as is
            if test_result[0] in ['200', '300']:
                print(f"Scraper returned {test_result[0]}, also trying curl_cffi for comparison")
                try:
                    # Try curl_cffi alongside scraper for better results
                    curl_result = curl_cffi_content(url, useragent, proxy)
                    
                    # If curl_cffi returns 200/300 and has more content, prefer it
                    if curl_result[0] in ['200', '300'] and len(curl_result[1]) > len(content):
                        print(f"curl_cffi returned better content ({len(curl_result[1])} vs {len(content)} chars)")
                        # Check for home redirect detection from curl_cffi
                        if curl_result[3]:  # redirect_detected flag
                            return curl_result[1], ['300', curl_result[2], ['300']]
                        else:
                            return curl_result[1], [curl_result[0], curl_result[2], [curl_result[0]]]
                    else:
                        print(f"Using original scraper result")
                        return content, test_result
                except Exception as e:
                    print(f"curl_cffi fallback failed: {e}, using original scraper result")
                    return content, test_result
            
            # Handle redirects and 500 errors
            if '302' in test_result[2] or '500' in test_result[2] or '300' in test_result[2] or test_result[0] == '500':
                print(f"Detected redirect/500 error in scraper result: {test_result[2]}, trying curl_cffi")
                try:
                    # Try curl_cffi first for 500 errors (it has retry logic)
                    curl_result = curl_cffi_content(url, useragent, proxy)
                    if curl_result[0] in ['200', '300']:
                        print(f"curl_cffi succeeded where scraper failed")
                        # Check for home redirect detection
                        if curl_result[3]:  # redirect_detected flag
                            return curl_result[1], ['300', curl_result[2], ['300']]
                        else:
                            return curl_result[1], [curl_result[0], curl_result[2], [curl_result[0]]]
                    else:
                        print(f"curl_cffi also failed, trying Playwright")
                        content_parts = playwright_content(useragent, url, self.cur, self.con, proxy, cloudflare, self.config)
                        return content_parts[1], [content_parts[0], content_parts[2], ['200']]
                except Exception as e:
                    print(f"curl_cffi failed: {e}, trying Playwright")
                    try:
                        content_parts = playwright_content(useragent, url, self.cur, self.con, proxy, cloudflare, self.config)
                        return content_parts[1], [content_parts[0], content_parts[2], ['200']]
                    except Exception as e2:
                        print(f"Playwright also failed: {e2}, returning original scraper result")
                        return content, test_result
            
            # Check if scraper might have hit Cloudflare protection (certain HTTP codes)
            # and try curl_cffi first, then Aparser as fallback for Cloudflare websites
            if test_result[0] in ['403', '429', '503']:
                try:
                    print(f"Scraper returned {test_result[0]}, trying curl_cffi first")
                    curl_result = curl_cffi_content(url, useragent, proxy)
                    if curl_result[0] in ['200', '300'] and len(curl_result[1]) > 50:
                        print(Fore.GREEN + "curl_cffi successfully bypassed protection" + Style.RESET_ALL)
                        # Check for home redirect detection
                        if curl_result[3]:  # redirect_detected flag
                            return curl_result[1], ['300', curl_result[2], ['300']]
                        else:
                            return curl_result[1], [curl_result[0], curl_result[2], [curl_result[0]]]
                    elif cloudflare:
                        print(Fore.YELLOW + f"curl_cffi failed for Cloudflare website, trying Aparser" + Style.RESET_ALL)
                        aparser_content = AparserLinkExtractor(url)
                        if aparser_content and len(aparser_content) > 50:
                            print(Fore.GREEN + "Aparser successfully extracted content from scraper fallback" + Style.RESET_ALL)
                            return aparser_content, ['200', 'null', ['200']]
                        else:
                            print(Fore.RED + "Aparser failed for scraper fallback, returning original result" + Style.RESET_ALL)
                            return content, test_result
                    else:
                        print(Fore.RED + "curl_cffi failed for non-Cloudflare website, returning original result" + Style.RESET_ALL)
                        return content, test_result
                except Exception as e:
                    print(Fore.RED + f"curl_cffi fallback failed: {e}, returning original scraper result" + Style.RESET_ALL)
                    return content, test_result
            
            return content, test_result
    
    def _analyze_content(self, content, test_result, check_http_code, codes_to_check,
                        check_stop_words, stop_words_to_check, homepage_redirect,
                        check_stop_words_on_page, display_link, global_settings, is_cloudflare, original_url=None, website_has_settings=False):
        """
        Analyze fetched content
        
        Args:
            content (str): Fetched content
            test_result (list): Test result from content fetching
            check_http_code (bool): Whether to check HTTP codes
            codes_to_check (list): HTTP codes to check
            check_stop_words (bool): Whether to check stop words
            stop_words_to_check (list): Stop words to check
            homepage_redirect (bool): Whether to check homepage redirect
            check_stop_words_on_page (bool): Whether to check stop words on page
            display_link (str): Display link
            global_settings (dict): Global settings
            is_cloudflare (bool): Whether website is behind Cloudflare
            original_url (str): Original URL, used for homepage-redirect detection
            website_has_settings (bool): Whether the website has its own deletion settings configured
        
        Returns:
            dict: Analysis result
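                with keys: status ('active', 'deleted', 'skipped', 'maintenance',
                'error' or 'rate_limited'), reason, is_deleted, http_code,
                redirect_url and all_codes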
        """
        code = test_result[0]
        res_url = test_result[1]
        codes = test_result[2]
        
        analysis = {
            'status': 'active',
            'reason': '',
            'is_deleted': False,
            'http_code': code,
            'redirect_url': res_url,
            'all_codes': codes
        }
        
        # Check HTTP codes - website settings first, then global
        if check_http_code and codes_to_check:
            if code == '401':
                code = '403'
            if code in codes_to_check:
                analysis['status'] = 'deleted'
                analysis['reason'] = f"HTTP code {code} found (website configured)"
                analysis['is_deleted'] = True
                return analysis
        elif not website_has_settings and global_settings and global_settings.get('http_code'):
            # Only check global HTTP codes if website doesn't have ANY settings configured
            global_http_codes = global_settings['http_code']
            if code in global_http_codes:
                analysis['status'] = 'deleted'
                analysis['reason'] = f"HTTP code {code} found (global)"
                analysis['is_deleted'] = True
                return analysis
        
        # Check global skip codes
        if global_settings and global_settings.get('skip_code'):
            skip_codes = global_settings['skip_code']
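            # Each skip entry is expected to be a dict of the form
            # {'code': <http code>, 'cloudflare': 0 or 1}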
            for skip_item in skip_codes:
                if isinstance(skip_item, dict):
                    skip_code = skip_item['code']
                    cloudflare_only = skip_item['cloudflare'] == 1
                    
                    # Check if this skip code applies
                    if code == skip_code:
                        # If cloudflare_only is True, only skip for cloudflare websites
                        if cloudflare_only and is_cloudflare:
                            analysis['status'] = 'skipped'
                            analysis['reason'] = f"Skip code {code} found (global)"
                            analysis['is_deleted'] = False
                            return analysis
                        # elif cloudflare_only and not is_cloudflare:
                        #     analysis['status'] = 'skipped'
                        #     analysis['reason'] = f"Skip code {code} found (global)"
                        #     analysis['is_deleted'] = True
                        #     return analysis

                # else:
                #     # Backward compatibility for string values
                #     if code == skip_item:
                #         analysis['status'] = 'deleted'
                #         analysis['reason'] = f"Skip code {code} found (global)"
                #         analysis['is_deleted'] = True
                #         return analysis
        
        # Check for specific status codes (hardcoded but essential)
        # if code in ['404', '410', '451']:
        #     analysis['status'] = 'deleted'
        #     analysis['reason'] = f"HTTP code {code} found"
        #     analysis['is_deleted'] = True
        #     return analysis
        
        # Check for redirects - only if homepage_redirect is enabled
        if homepage_redirect:
            # Check if redirect was detected (code 300) or if final URL is homepage
            is_homepage_redirect = False
            
            if code == '300':
                # Explicit redirect detected
                is_homepage_redirect = True
            elif original_url and res_url and res_url != 'null':
                # Check if final URL is a homepage (regardless of domain)
                try:
                    original_parsed = urlparse(original_url)
                    final_parsed = urlparse(str(res_url))
                    
                    # Check if original had a path (not homepage) and final is homepage
                    # This works for same-domain and cross-domain redirects
                    original_has_path = len(original_parsed.path) > 2
                    final_is_homepage = (final_parsed.path == '/' or len(final_parsed.path) < 2)
                    
                    # If original had a path and final is homepage, it's a homepage redirect
                    if original_has_path and final_is_homepage:
                        is_homepage_redirect = True
                        domain_info = "cross-domain" if original_parsed.netloc != final_parsed.netloc else "same-domain"
                        print(f"Homepage redirect detected ({domain_info}): {original_url} -> {res_url} (code: {code})")
                except Exception as e:
                    print(f"Error parsing URLs for redirect check: {e}")
            
            if is_homepage_redirect:
                analysis['status'] = 'deleted'
                analysis['reason'] = 'Redirect to homepage (enabled in settings)'
                analysis['is_deleted'] = True
                return analysis
        
        # Check for maintenance or onion redirects
        if code in ['205', '206']:
            analysis['status'] = 'maintenance'
            analysis['reason'] = 'Site maintenance' if code == '205' else 'Onion redirect'
            return analysis

        # Parse content for stop words
        if len(content) < 10:
            analysis['status'] = 'error'
            analysis['reason'] = 'Very little content'
            return analysis
        
        try:
            soup = BeautifulSoup(content, 'html.parser')
        except Exception:
            analysis['status'] = 'error'
            analysis['reason'] = 'BS4 parsing error'
            return analysis
        
        # Prepare content for stop word checking
        # If check_stop_words_on_page is True: only check visible text (no script tags)
        # If check_stop_words_on_page is False: check raw HTML including script tags
        if check_stop_words_on_page:
            content_for_stop_words = soup.text  # Only visible text, no script tags
        else:
            content_for_stop_words = content  # Raw HTML including script tags
        
        # Check stop words - website settings first, then global
        if check_stop_words and stop_words_to_check:
            for stop_word in stop_words_to_check:
                # Escape special regex characters to treat stop word as literal string
                escaped_pattern = re.escape(stop_word.strip())
                # IGNORECASE for case-insensitive matching (DOTALL has no effect on the escaped literal pattern)
                if re.search(escaped_pattern, content_for_stop_words, flags=re.IGNORECASE | re.DOTALL):
                    analysis['status'] = 'deleted'
                    analysis['reason'] = f"Stop word found: {stop_word} (website configured)"
                    analysis['is_deleted'] = True
                    return analysis
        elif not website_has_settings and global_settings and global_settings.get('stopword'):
            # Only check global stop words if website doesn't have ANY settings configured
            global_stop_words = global_settings['stopword']
            for stop_word in global_stop_words:
                # Escape special regex characters to treat stop word as literal string
                escaped_pattern = re.escape(stop_word.strip())
                # IGNORECASE for case-insensitive matching (DOTALL has no effect on the escaped literal pattern)
                if re.search(escaped_pattern, content_for_stop_words, flags=re.IGNORECASE | re.DOTALL):
                    analysis['status'] = 'deleted'
                    analysis['reason'] = f"Stop word found: {stop_word} (global)"
                    analysis['is_deleted'] = True
                    return analysis
        
        # Check deleted keywords
        try:
            if not self._ensure_database_connection():
                print("Database connection lost during deleted keywords check, skipping")
                deleted_keywords = []
            else:
                deleted_keywords = get_deleted_keywords(self.cur)
        except Exception as e:
            print(f"Failed to get deleted keywords: {e}")
            deleted_keywords = []
        
        for keyword in deleted_keywords:
            # Escape special regex characters to treat keyword as literal string
            escaped_pattern = re.escape(keyword.strip())
            # IGNORECASE for case-insensitive matching (DOTALL has no effect on the escaped literal pattern)
            if re.search(escaped_pattern, content_for_stop_words, flags=re.IGNORECASE | re.DOTALL):
                analysis['status'] = 'deleted'
                analysis['reason'] = f"Deleted keyword found: {keyword}"
                analysis['is_deleted'] = True
                return analysis
        
        # Check for upload sites
        try:
            if not self._ensure_database_connection():
                print("Database connection lost during upload sites check, skipping")
            else:
                sql = "SELECT * FROM upload_sites WHERE hostname = (%s)"
                self.cur.execute(sql, [display_link])
                if self.cur.rowcount:
                    analysis['reason'] += "Upload hosts link\n"
        except Exception as e:
            print(f"Failed to check upload sites: {e}")
        
        # Check for too many requests
        if 'Too many requests from your IP' in content or 'Too many requests' in content:
            analysis['status'] = 'rate_limited'
            analysis['reason'] = 'Too many requests from IP'
            return analysis
        
        analysis['reason'] += "No stop words found\n"
        return analysis
    
    def get_project_info(self, project_id):
        """
        Get project information
        
        Args:
            project_id (int): Project ID
        
        Returns:
            dict: Project information
        """
        try:
            if not self._ensure_database_connection():
                print("Database connection lost during project info fetch")
                return None
            return get_project(self.cur, project_id)
        except Exception as e:
            print(f"Failed to get project info: {e}")
            return None
        
    def get_score_threshold(self, search_phrase):
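        """
        Look up the fuzzy-match score threshold (project_title_score or
        project_author_score) from tse_configs, falling back to 50.
        """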
        default_title = 50
        default_author = 50
        config = 'project_title_score'
        if 'author' in search_phrase.lower():
            config = 'project_author_score'
        sql = "SELECT value FROM tse_configs WHERE component='FWS' AND config=%s"
        self.cur.execute(sql, [config])
        row = self.cur.fetchone()
        if row and row['value'] is not None:
            try:
                return float(row['value'])
            except Exception:
                pass
        return default_author if config == 'project_author_score' else default_title

    def search(self, link, search_phrase, search_term, search_index):
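        """
        Query the local search service at http://127.0.0.1:5000/search for the given
        index and compare the returned score against the threshold from
        get_score_threshold. Returns True when the score exceeds the threshold, when
        the service responds with a non-200 status, or on error.
        """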
        max_score = 0
        total_hits = 0
        print(Fore.GREEN + "Checking " + search_phrase + " .." + link)
        print(Style.RESET_ALL)

        try:
            json_data = {
                "index": search_index,
                "api_key": "aecaa0fe-4673-436d-a798-1601cf593f64"
            }

            r = requests.post('http://127.0.0.1:5000/search', json=json_data)
            print('Search: ', r.status_code)
            if r.status_code != 200:
                print(Fore.RED + "Bad status code ")
                print(Style.RESET_ALL)
                return True
            data = r.json()
            print(data)
            if 'score' in data:
                max_score = data['score']
                print(Fore.YELLOW)
                print("Score: " + str(max_score) + " " + link)
                print(Style.RESET_ALL)
            threshold = self.get_score_threshold(search_phrase)
            if float(max_score) > float(threshold):
                return True
        except Exception as e:
            print("Error: " + str(e))
            return True
        return False
    
    def check_title_on_page(self, url, project_id, content):
        """
        Check if project title is found on page
        
        Args:
            url (str): URL to check
            project_id (int): Project ID
            content (str): Page content
        
        Returns:
            bool: True if title (or its translation) found, or if the check errors out; False otherwise
        """
        try:
            if not self._ensure_database_connection():
                print("Database connection lost during title check")
                return False
            project = get_project(self.cur, project_id)
            title = project['project_title']
            translate = str(project['project_title_eng'])

            project_title_score = get_project_title_score(self.cur)

            try:
                # partial_ratio works best for finding phrase inside longer text
                score = fuzz.partial_ratio(title, content)
                results = int(score) > project_title_score
            except Exception as e:
                print(f"Error calculating similarity: {e}")
                results = True
                
            if results:
                print(Fore.YELLOW + "******* Found title " + title + " on " + url)
                print(Style.RESET_ALL)
                return True
            
            if translate and len(str(translate)) > 4:
                try:
                    # partial_ratio works best for finding phrase inside longer text
                    score = fuzz.partial_ratio(translate, content)
                    res = int(score) > project_title_score
                except Exception as e:
                    print(f"Error calculating similarity: {e}")
                    res = True
                if res:
                    print(Fore.YELLOW + "******* Found translate " + translate + " on " + url)
                    print(Style.RESET_ALL)
                    return True

            # NOTE: full title checking would need Solr integration; for now this
            # method relies on the fuzzy matching performed above.
            return False
        except Exception as e:
            print(f"Failed to check title on page: {e}")
            return True
