diff --git a/Downloader.py b/Downloader.py
new file mode 100644
index 0000000..642da5a
--- /dev/null
+++ b/Downloader.py
@@ -0,0 +1,145 @@
+import os
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import unquote
+from tqdm import tqdm
+import aiohttp
+import asyncio
+
+MAX_RETRIES = 2
+
+url = input("Enter the URL: ")
+
+# Fetch HTML content from the specified URL
+response = requests.get(url)
+html_content = response.text
+
+# Parse the HTML content
+soup = BeautifulSoup(html_content, 'lxml')
+
+# Find all elements with class 'playlistDownloadSong'
+elements = soup.find_all(class_='playlistDownloadSong')
+
+# Store the song page URLs in a list
+urls = []
+for element in elements:
+    link = element.find('a')
+    if link:
+        href = link.get('href')
+        urls.append(f'https://downloads.khinsider.com{href}')
+
+# URLs that still failed after retrying; all tasks run on a single
+# event loop thread, so plain list appends are safe without a lock
+failed_urls = []
+
+
+# Fetch HTML content asynchronously
+async def async_get_html_content(url):
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+    }
+
+    try:
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url, headers=headers) as response:
+                response.raise_for_status()
+                return await response.text()
+    except aiohttp.ClientError:
+        return None
+
+
+# Find audio URLs (FLAC preferred, then MP3, then M4A) and the album name
+def find_audio_urls_and_album_name(html_content):
+    audio_urls = []
+    album_name = None
+
+    if html_content:
+        soup = BeautifulSoup(html_content, "html.parser")
+
+        # Collect every audio link on the page, grouped by format
+        flac_urls, mp3_urls, m4a_urls = [], [], []
+        for link in soup.find_all("a", href=True):
+            href = link.get("href")
+            if href.endswith(".flac"):
+                flac_urls.append(href)
+            elif href.endswith(".mp3"):
+                mp3_urls.append(href)
+            elif href.endswith(".m4a"):
+                m4a_urls.append(href)
+
+        # Keep the best available format and drop duplicate links
+        audio_urls = list(dict.fromkeys(flac_urls or mp3_urls or m4a_urls))
+
+        # Extract album name
+        album_name_element = soup.select_one("#pageContent > p:nth-child(6) > b:nth-child(1)")
+        if album_name_element:
+            album_name = album_name_element.text.strip()
+
+    return audio_urls, album_name
+
+
+# Download a file asynchronously, retrying up to MAX_RETRIES times
+async def async_download_audio_file(session, url, directory, total_progress):
+    retries = 0
+
+    while retries <= MAX_RETRIES:
+        try:
+            async with session.get(url) as response:
+                response.raise_for_status()
+                content = await response.read()
+
+            # Unquote only the basename so %20 becomes a space again
+            # without altering the directory part of the path
+            filename = os.path.join(directory, unquote(os.path.basename(url)))
+
+            with open(filename, 'wb') as file:
+                file.write(content)
+
+            total_progress.update(1)
+            break  # Download succeeded
+        except Exception:
+            retries += 1
+            if retries <= MAX_RETRIES:
+                await asyncio.sleep(2)  # Wait a moment before retrying
+            else:
+                failed_urls.append(url)
+                break  # Give up after MAX_RETRIES attempts
+
+
+# Scrape one song page and download its audio files
+async def async_process_url(session, url, total_progress):
+    html_content = await async_get_html_content(url)
+    audio_urls, album_name = find_audio_urls_and_album_name(html_content)
+
+    if audio_urls and album_name:
+        # Strip characters that are not safe in directory names
+        sanitized_album_name = "".join(c if c.isalnum() or c in [' ', '-', '_'] else '' for c in album_name)
+        album_directory = os.path.join('Audio files', sanitized_album_name)
+        os.makedirs(album_directory, exist_ok=True)
+
+        for audio_url in audio_urls:
+            await async_download_audio_file(session, audio_url, album_directory, total_progress)
+    else:
+        pass  # No audio files (or album name) found for this URL
+
+
+async def main():
+    total_progress = tqdm(total=len(urls), desc="Total Progress", position=0)
+
+    async with aiohttp.ClientSession() as session:
+        # Schedule one task per song page and wait for all of them
+        tasks = [asyncio.create_task(async_process_url(session, url, total_progress))
+                 for url in urls]
+        await asyncio.gather(*tasks)
+
+    total_progress.close()
+
+    # Report any URLs that still failed after retrying
+    if failed_urls:
+        print("\nThe following files encountered errors during download:")
+        for failed_url in failed_urls:
+            print(f"- {failed_url}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
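The new downloader unquotes only the basename before joining it onto the album directory, so percent-encoded track names are restored without mangling the directory path. A minimal standalone illustration (the URL and folder here are made-up values, not taken from the patch):

```
import os
from urllib.parse import unquote

# Hypothetical song URL with a percent-encoded track name
url = "https://example.com/album/01%20Main%20Theme.flac"
directory = os.path.join("Audio files", "Some Album")  # made-up album folder

# Decode only the basename; the directory part stays untouched
filename = os.path.join(directory, unquote(os.path.basename(url)))
print(filename)  # e.g. Audio files/Some Album/01 Main Theme.flac
```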
diff --git a/README.md b/README.md
index 9fcb2aa..c38f976 100644
--- a/README.md
+++ b/README.md
@@ -13,54 +13,28 @@ Install the required python libraries by downloading the requirements.txt found
 pip install -r requirements.txt
 ```
 Start the program using Python and enter the album URL of the website by running
-Not all albums have mp3 or flac, If one does not work try the other
 ```
-python downloader-flac.py
-```
-or
-```
-python downloader-mp3.py
+python Downloader.py
 ```
 ## Info
-The downloaded files will be in either "MP3 files" or "FLAC files". The folders will be created where the python script is located.
+The script looks for FLAC, MP3 and M4A files. It prioritizes FLAC; if an album has no FLAC it falls back to MP3, and if both are missing it uses M4A.
+The downloaded files are placed in an "Audio files" folder. By default the folder is created where the python script is located.
 ## Custom path download
-
-<details>
-  <summary>MP3</summary>
+Find the line "album_directory = os.path.join('Audio files', sanitized_album_name)" in Downloader.py and replace it with the following code
 ### Windows
-Replace "album_directory = os.path.join('MP3 files', sanitized_album_name)" with
 ```
-base_directory = 'C:\\your\\custom\\path'
-album_directory = os.path.join(base_directory, 'MP3 files', sanitized_album_name)
+album_directory = os.path.join(r'C:\your\custom\path', sanitized_album_name)
 ```
 ### Linux
-Replace "album_directory = os.path.join('MP3 files', sanitized_album_name)" with
 ```
-base_directory = '/your/custom/path'
-album_directory = os.path.join(base_directory, 'MP3 files', sanitized_album_name)
+album_directory = os.path.join('/your/custom/path', sanitized_album_name)
 ```
-</details>
-<details>
-  <summary>FLAC</summary>
-
-### Windows
-Replace "album_directory = os.path.join('FLAC files', sanitized_album_name)" with
-```
-base_directory = 'C:\\your\\custom\\path'
-album_directory = os.path.join(base_directory, 'FLAC files', sanitized_album_name)
-```
-### Linux
-Replace "album_directory = os.path.join('FLAC files', sanitized_album_name)" with
-```
-base_directory = '/your/custom/path'
-album_directory = os.path.join(base_directory, 'FLAC files', sanitized_album_name)
-```
-</details>
+
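For orientation, this excerpt shows roughly how the custom-path replacement from the README sits inside `async_process_url` in Downloader.py after the edit (Windows variant shown; the path is a placeholder to substitute):

```
async def async_process_url(session, url, total_progress):
    html_content = await async_get_html_content(url)
    audio_urls, album_name = find_audio_urls_and_album_name(html_content)

    if audio_urls and album_name:
        sanitized_album_name = "".join(c if c.isalnum() or c in [' ', '-', '_'] else '' for c in album_name)
        # The replaced line: albums now land under the custom base path
        album_directory = os.path.join(r'C:\your\custom\path', sanitized_album_name)
        os.makedirs(album_directory, exist_ok=True)
        ...
```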
diff --git a/downloader-flac.py b/downloader-flac.py
deleted file mode 100644
index dc818da..0000000
--- a/downloader-flac.py
+++ /dev/null
@@ -1,148 +0,0 @@
-import os
-import requests
-import multiprocessing
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-from concurrent.futures import ThreadPoolExecutor
-from tqdm import tqdm
-
-url = input("Enter the URL: ")
-
-# Fetch HTML content from the specified URL
-response = requests.get(url)
-html_content = response.text
-
-# Parse the HTML content
-soup = BeautifulSoup(html_content, 'html.parser')
-
-# Find all elements with class 'playlistDownloadSong'
-elements = soup.find_all(class_='playlistDownloadSong')
-
-# Store URLs in a list
-urls = []
-for index, element in enumerate(elements):
-    link = element.find('a')
-    if link:
-        url = link.get('href')
-        full_url = f'https://downloads.khinsider.com{url}'
-        urls.append(full_url)
-
-
-# Function to fetch and parse HTML content
-def get_html_content(url):
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-    }
-
-    try:
-        response = requests.get(url, headers=headers)
-        response.raise_for_status()
-        return response.text
-    except requests.exceptions.RequestException as e:
-        print(f"Error fetching {url}: {e}")
-        return None
-
-
-# Function to find and save FLAC URLs and album name
-def find_flac_urls_and_album_name(html_content):
-    flac_urls = []
-    album_name = None
-
-    if html_content:
-        soup = BeautifulSoup(html_content, "html.parser")
-
-        # Find all links in the page
-        links = soup.find_all("a", href=True)
-        for link in links:
-            href = link.get("href")
-            if href.endswith(".flac"):
-                flac_url = href
-                flac_urls.append(flac_url)
-
-        # Extract album name
-        album_name_element = soup.select_one("#pageContent > p:nth-child(6) > b:nth-child(1)")
-        if album_name_element:
-            album_name = album_name_element.text.strip()
-
-    return flac_urls, album_name
-
-
-# Function to download a file
-def download_file(url, directory, total_progress):
-    try:
-        response = requests.get(url, stream=True)
-        response.raise_for_status()
-
-        # Unquote the filename to convert %20 back to spaces
-        filename = unquote(os.path.join(directory, os.path.basename(url)))
-
-        with open(filename, 'wb') as file:
-            for data in response.iter_content(chunk_size=1024):
-                file.write(data)
-        total_progress.update(1)  # Update the total progress by 1 for each file downloaded
-
-        # print(f"Downloaded: {filename}")
-    except requests.exceptions.RequestException as e:
-        print(f"Error downloading {url}: {e}")
-
-
-# Function to process a single URL
-def process_url(url, total_progress):
-    # print(f"Scraping {url} for FLAC files...")
-    html_content = get_html_content(url)
-    flac_urls, album_name = find_flac_urls_and_album_name(html_content)
-
-    if flac_urls and album_name:
-        # Sanitize album name for creating a directory
-        sanitized_album_name = "".join(c if c.isalnum() or c in [' ', '-', '_'] else '' for c in album_name)
-        album_directory = os.path.join('FLAC files', sanitized_album_name)
-        os.makedirs(album_directory, exist_ok=True)
-
-        # print(f"FLAC files found for album '{album_name}':")
-        for flac_url in flac_urls:
-            download_file(flac_url, album_directory, total_progress)
-    else:
-        # No need to print an error message here
-        pass
-
-
-def get_cpu_threads():
-    try:
-        # For Linux/Unix/MacOS
-        num_threads = os.cpu_count() or 1
-    except NotImplementedError:
-        # For Windows
-        num_threads = multiprocessing.cpu_count() or 1
-
-    return num_threads
-
-if __name__ == "__main__":
-    cpu_threads = get_cpu_threads()
-
-# Use ThreadPoolExecutor to run the process_url function concurrently
-with ThreadPoolExecutor(max_workers=cpu_threads) as executor:
-    total_items = len(urls)
-    total_progress = tqdm(total=total_items, desc="Total Progress", position=0)
-
-    futures = []
-    for url in urls:
-        future = executor.submit(process_url, url, total_progress)
-        futures.append(future)
-
-    # Wait for all futures to complete
-    for future in futures:
-        future.result()
-
-    total_progress.close()
-
-# Display the final message based on the download results
-downloaded_files = total_progress.n
-error_message = None
-
-if downloaded_files == 0:
-    error_message = "Album name missing from site."
-elif downloaded_files < total_items:
-    error_message = f"{total_items - downloaded_files} files not downloaded. Missing FLAC files."
-
-if error_message:
-    print(error_message)
diff --git a/downloader-mp3.py b/downloader-mp3.py
deleted file mode 100644
index 78a068d..0000000
--- a/downloader-mp3.py
+++ /dev/null
@@ -1,148 +0,0 @@
-import os
-import requests
-import multiprocessing
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-from concurrent.futures import ThreadPoolExecutor
-from tqdm import tqdm
-
-url = input("Enter the URL: ")
-
-# Fetch HTML content from the specified URL
-response = requests.get(url)
-html_content = response.text
-
-# Parse the HTML content
-soup = BeautifulSoup(html_content, 'html.parser')
-
-# Find all elements with class 'playlistDownloadSong'
-elements = soup.find_all(class_='playlistDownloadSong')
-
-# Store URLs in a list
-urls = []
-for index, element in enumerate(elements):
-    link = element.find('a')
-    if link:
-        url = link.get('href')
-        full_url = f'https://downloads.khinsider.com{url}'
-        urls.append(full_url)
-
-
-# Function to fetch and parse HTML content
-def get_html_content(url):
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-    }
-
-    try:
-        response = requests.get(url, headers=headers)
-        response.raise_for_status()
-        return response.text
-    except requests.exceptions.RequestException as e:
-        print(f"Error fetching {url}: {e}")
-        return None
-
-
-# Function to find and save MP3 URLs and album name
-def find_mp3_urls_and_album_name(html_content):
-    mp3_urls = []
-    album_name = None
-
-    if html_content:
-        soup = BeautifulSoup(html_content, "html.parser")
-
-        # Find all links in the page
-        links = soup.find_all("a", href=True)
-        for link in links:
-            href = link.get("href")
-            if href.endswith(".mp3"):
-                mp3_url = href
-                mp3_urls.append(mp3_url)
-
-        # Extract album name
-        album_name_element = soup.select_one("#pageContent > p:nth-child(6) > b:nth-child(1)")
-        if album_name_element:
-            album_name = album_name_element.text.strip()
-
-    return mp3_urls, album_name
-
-
-# Function to download a file
-def download_file(url, directory, total_progress):
-    try:
-        response = requests.get(url, stream=True)
-        response.raise_for_status()
-
-        # Unquote the filename to convert %20 back to spaces
-        filename = unquote(os.path.join(directory, os.path.basename(url)))
-
-        with open(filename, 'wb') as file:
-            for data in response.iter_content(chunk_size=1024):
-                file.write(data)
-        total_progress.update(1)  # Update the total progress by 1 for each file downloaded
-
-        # print(f"Downloaded: {filename}")
-    except requests.exceptions.RequestException as e:
-        print(f"Error downloading {url}: {e}")
-
-
-# Function to process a single URL
-def process_url(url, total_progress):
-    # print(f"Scraping {url} for MP3 files...")
-    html_content = get_html_content(url)
-    mp3_urls, album_name = find_mp3_urls_and_album_name(html_content)
-
-    if mp3_urls and album_name:
-        # Sanitize album name for creating a directory
-        sanitized_album_name = "".join(c if c.isalnum() or c in [' ', '-', '_'] else '' for c in album_name)
-        album_directory = os.path.join('MP3 files', sanitized_album_name)
-        os.makedirs(album_directory, exist_ok=True)
-
-        # print(f"MP3 files found for album '{album_name}':")
-        for mp3_url in mp3_urls:
-            download_file(mp3_url, album_directory, total_progress)
-    else:
-        # No need to print an error message here
-        pass
-
-
-def get_cpu_threads():
-    try:
-        # For Linux/Unix/MacOS
-        num_threads = os.cpu_count() or 1
-    except NotImplementedError:
-        # For Windows
-        num_threads = multiprocessing.cpu_count() or 1
-
-    return num_threads
-
-if __name__ == "__main__":
-    cpu_threads = get_cpu_threads()
-
-# Use ThreadPoolExecutor to run the process_url function concurrently
-with ThreadPoolExecutor(max_workers=cpu_threads) as executor:
-    total_items = len(urls)
-    total_progress = tqdm(total=total_items, desc="Total Progress", position=0)
-
-    futures = []
-    for url in urls:
-        future = executor.submit(process_url, url, total_progress)
-        futures.append(future)
-
-    # Wait for all futures to complete
-    for future in futures:
-        future.result()
-
-    total_progress.close()
-
-# Display the final message based on the download results
-downloaded_files = total_progress.n
-error_message = None
-
-if downloaded_files == 0:
-    error_message = "Album name missing from site."
-elif downloaded_files < total_items:
-    error_message = f"{total_items - downloaded_files} files not downloaded. Missing MP3 files."
-
-if error_message:
-    print(error_message)
diff --git a/requirements.txt b/requirements.txt
index c85e329..927d706 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
-beautifulsoup4==4.9.3
-requests==2.25.1
-tqdm==4.56.0
+beautifulsoup4==4.10.0
+requests==2.26.0
+tqdm==4.62.3
+aiohttp==3.8.6
+lxml==5.1.0
\ No newline at end of file
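As written, `main()` in the new Downloader.py schedules a task for every song page at once. If a cap on simultaneous downloads is ever wanted, an `asyncio.Semaphore` is the usual tool; a sketch that could replace the direct `asyncio.gather` call in `main()` (the helper name and the limit of 8 are illustrative assumptions, not part of the patch):

```
import asyncio

async def bounded_gather(session, urls, total_progress, limit=8):
    # Allow at most `limit` song pages to be processed concurrently
    semaphore = asyncio.Semaphore(limit)

    async def bounded(url):
        async with semaphore:
            await async_process_url(session, url, total_progress)

    await asyncio.gather(*(bounded(u) for u in urls))
```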