|
| 1 | +# -*- coding: UTF-8 -*- |
| 2 | +# A part of NonVisual Desktop Access (NVDA) |
| 3 | +# Copyright (C) 2025 NV Access Limited, tianze |
| 4 | +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. |
| 5 | +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt |
| 6 | +""" Localcaptioner module for NVDA. |
| 7 | +
|
| 8 | +This module provides local image captioning functionality using ONNX models. |
| 9 | +It allows users to capture screen regions and generate captions using local AI models. |
| 10 | +""" |
| 11 | + |
| 12 | +from __future__ import unicode_literals |
| 13 | + |
| 14 | +import os |
| 15 | +import sys |
| 16 | +from typing import Optional |
| 17 | +import base64 |
| 18 | +import json |
| 19 | +import io |
| 20 | +import threading |
| 21 | + |
| 22 | +import wx |
| 23 | +import gui |
| 24 | +from gui import guiHelper |
| 25 | +import globalVars |
| 26 | +import config |
| 27 | +from logHandler import log |
| 28 | +from keyboardHandler import KeyboardInputGesture |
| 29 | +import scriptHandler |
| 30 | +import ui |
| 31 | +import globalPluginHandler |
| 32 | +import api |
| 33 | + |
| 34 | +from .captioner import ImageCaptioner |
| 35 | +from .modelManager import ModelManagerFrame |
| 36 | +from .panel import CaptionLocalSettingsPanel |
| 37 | + |
# Best-effort call to addonHandler.initTranslation(). A failure here
# (including addonHandler not being importable) must not stop this module
# from loading, so ordinary errors are deliberately ignored.
try:
    import addonHandler

    addonHandler.initTranslation()
except Exception:
    # Narrower than a bare ``except:`` so that KeyboardInterrupt and
    # SystemExit are no longer swallowed.
    pass
| 43 | + |
# Module-level configuration
# Absolute path of the "models" directory located two levels above this file.
_here = os.path.dirname(__file__)
_modelsDir = os.path.abspath(os.path.join(_here, "..", "..", "models"))

# Validation spec for the "captionLocal" configuration section.
# The default model path is wrapped in double quotes so that an installation
# path containing characters that are special in a spec string (for example
# a comma) is still read as one single default value.
CONFSPEC = {
    "localModelPath": f'string(default="{_modelsDir}/Xenova/vit-gpt2-image-captioning")',
    "loadModelWhenInit": "boolean(default=false)",
}

config.conf.spec["captionLocal"] = CONFSPEC
| 55 | + |
| 56 | + |
def shootImage() -> bytes:
    """Capture a screenshot of the current navigator object.

    Returns:
        The captured image data as bytes in JPEG format.

    Raises:
        RuntimeError: If the navigator object has no location, its area is
            empty, or the captured image cannot be encoded as JPEG.
    """
    # The region to capture is the one occupied by the navigator object.
    obj = api.getNavigatorObject()

    location = obj.location
    if location is None:
        # Fail with a clear message instead of an unpacking TypeError.
        raise RuntimeError("The navigator object has no screen location to capture")
    x, y, width, height = location
    if width <= 0 or height <= 0:
        raise RuntimeError("The navigator object has an empty screen area")

    # Copy the screen region into an off-screen bitmap of the same size.
    bmp = wx.Bitmap(width, height)
    mem = wx.MemoryDC(bmp)
    try:
        mem.Blit(0, 0, width, height, wx.ScreenDC(), x, y)
    finally:
        # Deselect the bitmap from the DC before the bitmap is used elsewhere.
        mem.SelectObject(wx.NullBitmap)

    # Encode the bitmap as JPEG into an in-memory buffer.
    image = bmp.ConvertToImage()
    body = io.BytesIO()
    if not image.SaveFile(body, wx.BITMAP_TYPE_JPEG):
        raise RuntimeError("Failed to encode the captured image as JPEG")
    return body.getvalue()
| 90 | + |
| 91 | + |
def caption(captioner: ImageCaptioner, imageData: bytes) -> None:
    """Generate a caption for the given image data and report it.

    On success the caption is passed to ``ui.message`` and copied to the
    clipboard without a notification. Any exception is reported through
    ``ui.message`` and logged; it is not re-raised.

    Args:
        captioner: The captioner instance to use for generation.
        imageData: The image data to caption.
    """
    try:
        description = captioner.generate_caption(image=imageData)
        ui.message(description)
        api.copyToClip(text=description, notify=False)
    except Exception as e:
        ui.message(str(e))
        # log.exception keeps the traceback, which log.error(e) dropped.
        log.exception("Image captioning failed")
| 106 | + |
def disableInSecureMode(decoratedCls):
    """Class decorator that swaps the class out when running in secure mode.

    Args:
        decoratedCls: The class being decorated.

    Returns:
        ``globalPluginHandler.GlobalPlugin`` when ``globalVars.appArgs.secure``
        is truthy, otherwise ``decoratedCls`` unchanged.
    """
    secureMode = globalVars.appArgs.secure
    return globalPluginHandler.GlobalPlugin if secureMode else decoratedCls
| 111 | + |
| 112 | + |
@disableInSecureMode
class LocalCaptioner:
    """Image captioning using local ONNX models.

    Captures the screen region of the navigator object and generates a
    descriptive caption for it. Also registers the captioner's settings
    panel and gives access to the model manager window.
    """

    def __init__(self) -> None:
        """Set up state, optionally preload the model and register the settings panel."""
        self.isModelLoaded = False
        self.captioner: Optional[ImageCaptioner] = None
        self.managerFrame: Optional[ModelManagerFrame] = None

        # Load model when initializing (may cause high memory usage).
        # Done on a daemon thread so that start-up is not blocked.
        if config.conf["captionLocal"]["loadModelWhenInit"]:
            threading.Thread(target=self._loadModel, daemon=True).start()

        gui.settingsDialogs.NVDASettingsDialog.categoryClasses.append(CaptionLocalSettingsPanel)

    def terminate(self) -> None:
        """Unregister the settings panel added in ``__init__``."""
        try:
            gui.settingsDialogs.NVDASettingsDialog.categoryClasses.remove(CaptionLocalSettingsPanel)
        except (ValueError, AttributeError):
            # The panel is not registered (any more); nothing to clean up.
            pass

    def runCaption(self, gesture) -> None:
        """Script to run image captioning on the current navigator object.

        The screenshot is taken first. If no model is loaded yet it is loaded
        synchronously, then the caption is generated on a background thread.

        Args:
            gesture: The input gesture that triggered this script.

        Raises:
            Exception: Whatever ``shootImage`` or ``_loadModel`` raises.
        """
        imageData = shootImage()

        if not self.isModelLoaded:
            # Translators: Message when loading the model
            ui.message(_("loading model..."))
            self._loadModel()

        # Daemon thread, consistent with the loader thread started in
        # __init__, so a running caption does not keep the process alive.
        imageThread = threading.Thread(
            target=caption,
            args=(self.captioner, imageData),
            daemon=True,
        )
        # Translators: Message when starting image recognition
        ui.message(_("starting recognize"))
        imageThread.start()

    def _loadModel(self) -> None:
        """Load the ONNX model for image captioning.

        Reads the model directory from the ``captionLocal`` configuration
        section and builds the ``ImageCaptioner`` from the encoder, decoder
        and config files inside it.

        Raises:
            Exception: If the model cannot be loaded. The error text is
                reported through ``ui.message`` before being re-raised.
        """
        try:
            localModelDirPath = config.conf["captionLocal"]["localModelPath"]
            encoderPath = f"{localModelDirPath}/onnx/encoder_model_quantized.onnx"
            decoderPath = f"{localModelDirPath}/onnx/decoder_model_merged_quantized.onnx"
            configPath = f"{localModelDirPath}/config.json"

            self.captioner = ImageCaptioner(
                encoder_path=encoderPath,
                decoder_path=decoderPath,
                config_path=configPath,
            )
            self.isModelLoaded = True
        except Exception as e:
            self.isModelLoaded = False
            ui.message(str(e))
            raise

    def releaseModel(self, gesture) -> None:
        """Script to release the loaded model from memory.

        Args:
            gesture: The input gesture that triggered this script.

        Raises:
            Exception: Any error raised while releasing; the error text is
                reported through ``ui.message`` before being re-raised.
        """
        # Translators: Message when releasing the model
        ui.message(_("releasing model..."))
        try:
            if self.captioner:
                # Drop the reference held by this object.
                self.captioner = None
                # Translators: Message when model is successfully released
                ui.message(_("model released and memory freed"))
            self.isModelLoaded = False
        except Exception as e:
            ui.message(str(e))
            raise

    def openManager(self, gesture) -> None:
        """Script to open the model manager window.

        Args:
            gesture: The input gesture that triggered this script.

        Raises:
            Exception: Any error raised while scheduling the window; the
                error text is reported through ``ui.message`` first.
        """
        # Translators: Message when opening model manager
        ui.message(_("opening model manager..."))
        try:
            self._openModelManager()
        except Exception as e:
            ui.message(str(e))
            raise

    def _openModelManager(self) -> None:
        """Schedule the model manager frame to be shown via ``wx.CallAfter``."""

        def showManager() -> None:
            """Create the model manager frame if needed, then show and raise it."""
            try:
                # Use existing wx.App if available
                app = wx.GetApp()
                if app is None:
                    # Keep the new app referenced while the frame is created.
                    app = wx.App()

                # A falsy managerFrame means it was never created.
                # NOTE(review): presumably a destroyed wx frame also
                # evaluates as False here - confirm.
                if not self.managerFrame:
                    self.managerFrame = ModelManagerFrame()

                self.managerFrame.Show()
                self.managerFrame.Raise()

            except Exception as e:
                ui.message(str(e))
                # Nobody can catch this inside the deferred call, so record
                # the traceback instead of losing it.
                log.exception("Failed to show the model manager")

        # Ensure execution in main thread
        wx.CallAfter(showManager)
| 242 | + |
| 243 | + |
| 244 | + |
| 245 | + |
| 246 | + |
def getLocalCaptionerConfig():
    """Return the local captioner's configuration section.

    Returns:
        The ``captionLocal`` section of ``config.conf``. This is the section
        name this module registers its spec under and reads its settings
        from; the previous key ``"localcaptioner"`` did not match it.
    """
    return config.conf["captionLocal"]
| 249 | + |
| 250 | + |
| 251 | + |
| 252 | + |
def initialize():
    """Create the module-wide local captioner.

    Builds a ``LocalCaptioner`` and stores it in the module global
    ``_localCaptioner``.
    """
    global _localCaptioner
    log.debug("Initializing local captioner")
    captionerInstance = LocalCaptioner()
    _localCaptioner = captionerInstance
| 258 | + |
| 259 | + |
| 260 | + |
def terminate():
    """Terminate the local captioner.

    Calls ``terminate()`` on the instance stored in the module global
    ``_localCaptioner`` and resets that global to ``None``. Does nothing
    except log a debug message when no instance exists.
    """
    global _localCaptioner
    # The global has no module-level definition, so look it up through
    # globals(): calling terminate() before the global has ever been assigned
    # must not raise NameError.
    instance = globals().get("_localCaptioner")
    if instance is None:
        log.debug("local captioner not running.")
        return
    log.debug("Terminating local captioner")
    instance.terminate()
    _localCaptioner = None
| 270 | + |
| 271 | + |
0 commit comments