diff --git a/README.md b/README.md
index 5b7dc69..c305765 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ A free program that extracts hard coded subtitles from a video and generates an
 - Multiple languages supported through [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR). They will be
   automatically downloaded as needed.
 
-Generated subtitles can be translated with this [script](https://github.com/voun7/Subtitle_Translator).
+### Generated subtitles can be translated with this [script](https://github.com/voun7/Subtitle_Translator).
 
 **Supported languages**
 
@@ -73,6 +73,20 @@ installed. The program will not start without it.
 
 Install packages
 
+For GPU
+
+```commandline
+pip install onnxruntime-gpu[cuda,cudnn]==1.21.0
+```
+
+For CPU
+
+```commandline
+pip install onnxruntime==1.21.0
+```
+
+Other packages
+
 ```commandline
 pip install -r requirements.txt
 ```
@@ -83,3 +97,4 @@ Run `gui.py` to use Graphical interface and `main.py` to use Terminal.
 
 Run `compiler.py` to build compiled program
 
+If a virus warning occurs, the file can be submitted [here](https://www.microsoft.com/en-us/wdsi/filesubmission) for analysis.
diff --git a/compiler.py b/compiler.py
index e0ac1f4..32eab4c 100644
--- a/compiler.py
+++ b/compiler.py
@@ -85,8 +85,10 @@ def delete_dist_dir() -> None:
 
 def main() -> None:
     start_time = perf_counter()
+    uninstall_package("onnxruntime-gpu")  # not supported by Nuitka
+    install_package("onnxruntime==1.21.0")
     install_requirements()
-    install_package("Nuitka==2.6.7")
+    install_package("Nuitka==2.6.9")
     download_all_models()
     remove_non_onnx_models()
     uninstall_package("paddlepaddle")
diff --git a/docs/Preferences.md b/docs/Preferences.md
index 536e858..95c6aa6 100644
--- a/docs/Preferences.md
+++ b/docs/Preferences.md
@@ -43,8 +43,8 @@ Text Extraction Batch Size: The number of frames to be extracted by each CPU or
 
 Onnx Intra Threads: The number of threads used by Onnx to parallelize the execution within nodes.
 
-OCR CPU Max Processes: The maximum number of CPU cores to be used for extraction of text from the video. Too little or
-too many will increase extraction time.
+OCR Max Processes: The maximum number of CPU or GPU processes to be used for extraction of text from the video.
+Too few or too many will increase extraction time.
 
 **Note:** Onnx Intra Threads & OCR CPU Max Processes will require some testing with different values to determine the
 optimal values.
diff --git a/docs/images/text extract.png b/docs/images/text extract.png
index 4d9e88a..bc3d136 100644
Binary files a/docs/images/text extract.png and b/docs/images/text extract.png differ
diff --git a/gui.py b/gui.py
index d930d7f..423c6e9 100644
--- a/gui.py
+++ b/gui.py
@@ -14,8 +14,7 @@ from PIL import Image, ImageTk
 
 import utilities.utils as utils
-from main import SubtitleDetector, SubtitleExtractor
-from utilities.frames_to_text import download_models
+from main import SubtitleDetector, SubtitleExtractor, setup_ocr
 from utilities.logger_setup import setup_logging
 from utilities.win_notify import Notification, Sound
 
 
@@ -719,10 +718,10 @@ def _set_progress_output(self, text: str) -> None:
         """
         Overwrite progress bar text in text widget, if detected in present and previous line.
         """
-        if " |#" in text or "-| " in text or "it/s" in text:
+        if " |#" in text or "-| " in text:
             start, stop = 'end - 1 lines', 'end - 1 lines lineend'
             previous_line = self.text_output_widget.get(start, stop)
-            if " |#" in previous_line or "-| " in previous_line or "it/s" in previous_line:
+            if " |#" in previous_line or "-| " in previous_line:
                 self.clear_output(start, stop)
 
     def write_to_output(self, text: str) -> None:
@@ -751,15 +750,6 @@ def send_notification(self, title: str, message: str = "") -> None:
             toast.set_audio(sound, loop=utils.Config.win_notify_loop_sound)
             toast.show()
 
-    def gui_model_download(self) -> None:
-        """
-        Modify the gui to properly display the download of the models. This method should not be run from main thread.
-        tqdm uses stderr so the download progress texts are rerouted.
-        """
-        sys.stderr.write = self.write_to_output
-        download_models()  # if lang changes, the new lang model will be downloaded.
-        sys.stderr.write = self.error_message_handler
-
     def _detect_subtitles(self) -> None:
         """
         Detect sub area of videos in the queue and set as new sub area.
@@ -768,7 +758,7 @@ def _detect_subtitles(self) -> None:
         start, use_search_area = time.perf_counter(), utils.Config.use_search_area
         self.thread_running = True
         try:
-            self.gui_model_download()
+            setup_ocr()
             start = time.perf_counter()
             for video in self.video_queue.keys():
                 if utils.Process.interrupt_process:
@@ -819,7 +809,7 @@ def extract_subtitles(self) -> None:
         logger.info(f"Subtitle Language: {utils.Config.ocr_rec_language}\n")
         self.thread_running = True
         try:
-            self.gui_model_download()
+            setup_ocr()
             for video, sub_info in self.video_queue.items():
                 sub_area, start_frame, stop_frame = sub_info[0], sub_info[1], sub_info[2]
                 start_frame = int(start_frame) if start_frame else start_frame
@@ -1116,13 +1106,13 @@ def _text_extraction_tab(self) -> None:
             width=self.spinbox_size
         ).grid(column=1, row=1)
 
-        ttk.Label(text_extraction_frame, text="OCR CPU Max Processes:").grid(column=0, row=2, pady=self.wgt_y_padding)
-        self.ocr_cpu_max_processes = tk.IntVar(value=utils.Config.ocr_cpu_max_processes)
-        self.ocr_cpu_max_processes.trace_add("write", self._set_reset_button)
+        ttk.Label(text_extraction_frame, text="OCR Max Processes:").grid(column=0, row=2, pady=self.wgt_y_padding)
+        self.ocr_max_processes = tk.IntVar(value=utils.Config.ocr_max_processes)
+        self.ocr_max_processes.trace_add("write", self._set_reset_button)
         ttk.Spinbox(
             text_extraction_frame, from_=1, to=cpu_count(),
-            textvariable=self.ocr_cpu_max_processes,
+            textvariable=self.ocr_max_processes,
             state="readonly",
             width=self.spinbox_size
         ).grid(column=1, row=2)
 
@@ -1284,7 +1274,7 @@ def _set_reset_button(self, *args) -> None:
             utils.Config.default_frame_extraction_batch_size,
             utils.Config.default_text_extraction_batch_size,
             utils.Config.default_onnx_intra_threads,
-            utils.Config.default_ocr_cpu_max_processes,
+            utils.Config.default_ocr_max_processes,
             utils.Config.default_ocr_rec_language,
             utils.Config.default_text_drop_score,
             utils.Config.default_line_break,
@@ -1309,7 +1299,7 @@ def _set_reset_button(self, *args) -> None:
             self.frame_extraction_batch_size.get(),
             self.text_extraction_batch_size.get(),
             self.onnx_intra_threads.get(),
-            self.ocr_cpu_max_processes.get(),
+            self.ocr_max_processes.get(),
             self.ocr_rec_language.get(),
             self.text_drop_score.get(),
             self.line_break.get(),
@@ -1371,7 +1361,7 @@ def _reset_settings(self) -> None:
         # Text extraction settings.
         self.text_extraction_batch_size.set(utils.Config.default_text_extraction_batch_size)
         self.onnx_intra_threads.set(utils.Config.default_onnx_intra_threads)
-        self.ocr_cpu_max_processes.set(utils.Config.default_ocr_cpu_max_processes)
+        self.ocr_max_processes.set(utils.Config.default_ocr_max_processes)
         self.ocr_rec_language.set(utils.Config.default_ocr_rec_language)
         self.text_drop_score.set(utils.Config.default_text_drop_score)
         self.line_break.set(utils.Config.default_line_break)
@@ -1405,7 +1395,7 @@ def _save_settings(self) -> None:
             # Text extraction settings.
             utils.Config.keys[2]: self.text_extraction_batch_size.get(),
             utils.Config.keys[3]: self.onnx_intra_threads.get(),
-            utils.Config.keys[17]: self.ocr_cpu_max_processes.get(),
+            utils.Config.keys[17]: self.ocr_max_processes.get(),
             utils.Config.keys[4]: self.ocr_rec_language.get(),
             utils.Config.keys[18]: self.text_drop_score.get(),
             utils.Config.keys[20]: self.line_break.get(),
diff --git a/main.py b/main.py
index 1a16d9d..cecf6fb 100644
--- a/main.py
+++ b/main.py
@@ -8,7 +8,7 @@ import cv2 as cv
 
 import utilities.utils as utils
-from utilities.frames_to_text import extract_bboxes, frames_to_text, download_models
+from utilities.frames_to_text import extract_bboxes, frames_to_text, setup_ocr
 from utilities.logger_setup import setup_logging
 from utilities.video_to_frames import extract_frames, video_to_frames
 
 
@@ -470,7 +470,7 @@ def run_extraction(self, video_path: str, sub_area: tuple = None, start_frame: i
 if __name__ == '__main__':
     setup_logging()
     logger.debug("\n\nMain program Started.")
-    download_models()
+    setup_ocr()
     test_se = SubtitleExtractor()
     test_vid = r""
     test_se.run_extraction(test_vid)
diff --git a/requirements.txt b/requirements.txt
index 4236b41..e58f122 100644
Binary files a/requirements.txt and b/requirements.txt differ
diff --git a/utilities/frames_to_text.py b/utilities/frames_to_text.py
index d7578e6..89544d3 100644
--- a/utilities/frames_to_text.py
+++ b/utilities/frames_to_text.py
@@ -10,6 +10,21 @@ logger = logging.getLogger(__name__)
 
 
+def setup_ocr() -> None:
+    setup_ocr_device()
+    download_models()
+
+
+def setup_ocr_device() -> None:
+    if utils.Config.use_gpu and ort.get_device() == "GPU":
+        logger.debug("GPU is enabled.")
+        ort.preload_dlls()
+        utils.Config.ocr_opts["onnx_providers"] = ["CUDAExecutionProvider", "CPUExecutionProvider"]
+    else:
+        logger.debug("GPU is disabled.")
+        utils.Config.ocr_opts.pop("onnx_providers", None)
+
+
 def download_models() -> None:
     """
     Download models if dir does not exist.
@@ -74,7 +89,7 @@ def frames_to_text(frame_output: Path, text_output: Path) -> None:
     file_batches = [files[i:i + batch_size] for i in range(0, len(files), batch_size)]
     no_batches = len(file_batches)
     logger.info(f"Starting Multiprocess {prefix} from frames on {device}, Batches: {no_batches}.")
-    with ThreadPoolExecutor(utils.Config.ocr_cpu_max_processes) as executor:
+    with ThreadPoolExecutor(utils.Config.ocr_max_processes) as executor:
         futures = [executor.submit(extract_text, ocr_engine, text_output, files, line_sep) for files in file_batches]
         for i, f in enumerate(as_completed(futures)):  # as each process completes
             f.result()  # Prevents silent bugs. Exceptions raised will be displayed.
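The `setup_ocr_device()` function added above only registers `CUDAExecutionProvider` when the GPU build of ONNX Runtime actually reports a GPU; everything else falls back to CPU execution. Below is a minimal sketch of that provider-selection pattern using only public `onnxruntime` calls; the `pick_providers` helper, the `use_gpu` flag and the model path are placeholder assumptions, not code from this repository.

```python
import onnxruntime as ort


def pick_providers(use_gpu: bool) -> list[str]:
    # ort.get_device() reports "GPU" only when the onnxruntime-gpu build is installed
    # and CUDA is usable; otherwise only CPU execution is available.
    if use_gpu and ort.get_device() == "GPU":
        return ["CUDAExecutionProvider", "CPUExecutionProvider"]  # keep CPU as a fallback
    return ["CPUExecutionProvider"]


print(pick_providers(use_gpu=True))
# The chosen list would then be handed to a session, e.g.:
# session = ort.InferenceSession("some_model.onnx", providers=pick_providers(True))
```

Keeping `CPUExecutionProvider` at the end of the list lets ONNX Runtime fall back gracefully for any operation that cannot run on CUDA.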
diff --git a/utilities/test_main.py b/utilities/test_main.py
index a7d1cb4..740db92 100644
--- a/utilities/test_main.py
+++ b/utilities/test_main.py
@@ -4,10 +4,11 @@
 
 os.chdir(Path(__file__).parent.parent)
 
-from main import SubtitleDetector, SubtitleExtractor
+from main import SubtitleDetector, SubtitleExtractor, setup_ocr
 
 ch_vid = "test files/chinese_vid.mp4"
 ch_vid_srt = Path("test files/chinese_vid.srt")
+setup_ocr()
 
 
 class TestSubtitleDetector(TestCase):
diff --git a/utilities/utils.py b/utilities/utils.py
index fda306c..b7525b2 100644
--- a/utilities/utils.py
+++ b/utilities/utils.py
@@ -36,7 +36,7 @@ class Config:
             "onnx_intra_threads", "ocr_rec_language", "text_similarity_threshold", "min_consecutive_sub_dur_ms",
             "max_consecutive_short_durs", "min_sub_duration_ms", "split_start", "split_stop", "no_of_frames",
             "sub_area_x_rel_padding", "sub_area_y_abs_padding", "use_search_area", "win_notify_sound",
-            "win_notify_loop_sound", "ocr_cpu_max_processes", "text_drop_score", "use_gpu", "line_break"]
+            "win_notify_loop_sound", "ocr_max_processes", "text_drop_score", "use_gpu", "line_break"]
 
     # Permanent values
     subarea_height_scaler = 0.75
@@ -51,7 +51,7 @@ class Config:
 
     default_text_extraction_batch_size = 100
     default_onnx_intra_threads = 8
-    default_ocr_cpu_max_processes = 6
+    default_ocr_max_processes = 6
     default_ocr_rec_language = "ch"
     default_text_drop_score = 0.7
     default_line_break = False
@@ -74,7 +74,7 @@ class Config:
 
     # Initial values
     frame_extraction_frequency = frame_extraction_batch_size = None
-    text_extraction_batch_size = onnx_intra_threads = ocr_cpu_max_processes = ocr_rec_language = text_drop_score = None
+    text_extraction_batch_size = onnx_intra_threads = ocr_max_processes = ocr_rec_language = text_drop_score = None
     text_similarity_threshold = min_consecutive_sub_dur_ms = max_consecutive_short_durs = min_sub_duration_ms = use_gpu = None
     split_start = split_stop = no_of_frames = sub_area_x_rel_padding = sub_area_y_abs_padding = use_search_area = None
     win_notify_sound = win_notify_loop_sound = line_break = None
@@ -92,7 +92,7 @@ def create_default_config_file(self) -> None:
                                             self.keys[1]: self.default_frame_extraction_batch_size}
         self.config[self.sections[1]] = {self.keys[2]: self.default_text_extraction_batch_size,
                                          self.keys[3]: self.default_onnx_intra_threads,
-                                         self.keys[17]: self.default_ocr_cpu_max_processes,
+                                         self.keys[17]: self.default_ocr_max_processes,
                                          self.keys[4]: self.default_ocr_rec_language,
                                          self.keys[18]: self.default_text_drop_score,
                                          self.keys[20]: self.default_line_break}
@@ -122,7 +122,7 @@ def load_config(cls) -> None:
 
         cls.text_extraction_batch_size = cls.config[cls.sections[1]].getint(cls.keys[2])
         cls.onnx_intra_threads = cls.config[cls.sections[1]].getint(cls.keys[3])
-        cls.ocr_cpu_max_processes = cls.config[cls.sections[1]].getint(cls.keys[17])
+        cls.ocr_max_processes = cls.config[cls.sections[1]].getint(cls.keys[17])
         cls.ocr_rec_language = cls.config[cls.sections[1]][cls.keys[4]]
         cls.text_drop_score = cls.config[cls.sections[1]].getfloat(cls.keys[18])
         cls.line_break = cls.config[cls.sections[1]].getboolean(cls.keys[20])
@@ -160,8 +160,8 @@ def set_config(cls, **kwargs: int | float | str | bool) -> None:
         cls.config[cls.sections[1]][cls.keys[2]] = str(cls.text_extraction_batch_size)
         cls.onnx_intra_threads = kwargs.get(cls.keys[3], cls.onnx_intra_threads)
         cls.config[cls.sections[1]][cls.keys[3]] = str(cls.onnx_intra_threads)
-        cls.ocr_cpu_max_processes = kwargs.get(cls.keys[17], cls.ocr_cpu_max_processes)
-        cls.config[cls.sections[1]][cls.keys[17]] = str(cls.ocr_cpu_max_processes)
+        cls.ocr_max_processes = kwargs.get(cls.keys[17], cls.ocr_max_processes)
+        cls.config[cls.sections[1]][cls.keys[17]] = str(cls.ocr_max_processes)
         cls.ocr_rec_language = kwargs.get(cls.keys[4], cls.ocr_rec_language)
         cls.config[cls.sections[1]][cls.keys[4]] = cls.ocr_rec_language
         cls.text_drop_score = kwargs.get(cls.keys[18], cls.text_drop_score)
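Not part of the patch: after choosing between the GPU and CPU install commands added to the README earlier in this diff, it can be worth confirming which ONNX Runtime build is active before enabling the GPU preference. A hypothetical post-install check, using only public `onnxruntime` calls:

```python
import onnxruntime as ort

print(ort.__version__)                # expect 1.21.0, matching the pinned install commands
print(ort.get_device())               # "GPU" with onnxruntime-gpu and a working CUDA setup, otherwise "CPU"
print(ort.get_available_providers())  # the GPU build should list "CUDAExecutionProvider"
```

If `CUDAExecutionProvider` is not listed, the GPU branch in `setup_ocr_device()` is skipped and extraction runs on the CPU provider only.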