Skip to content

Commit 0200bcb

Browse files
Merge 7f460c8 into 37be7b8
2 parents 37be7b8 + 7f460c8 commit 0200bcb

8 files changed

Lines changed: 2018 additions & 2 deletions

File tree

pyproject.toml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ dependencies = [
4949
"mdx_truly_sane_lists==1.3",
5050
"markdown-link-attr-modifier==0.2.1",
5151
"mdx-gh-links==0.4",
52+
# local image caption
53+
"onnxruntime == 1.19.2",
5254
]
5355

5456
[project.urls]
@@ -322,11 +324,11 @@ lint = [
322324
"pyright==1.1.401",
323325
]
324326
license-check = [
325-
"licensecheck==2024.3",
327+
"licensecheck==2025.1",
326328
]
327329
dev-docs = [
328330
# For building developer documentation
329-
"sphinx==8.1.2",
331+
"sphinx==8.1.3",
330332
"sphinx_rtd_theme==3.0.1",
331333
]
332334
system-tests = [

source/_localCaptioner/__init__.py

Lines changed: 262 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,262 @@
1+
# A part of NonVisual Desktop Access (NVDA)
2+
# Copyright (C) 2025 NV Access Limited, tianze
3+
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
4+
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
5+
"""Localcaptioner module for NVDA.
6+
7+
This module provides local image captioning functionality using ONNX models.
8+
It allows users to capture screen regions and generate captions using local AI models.
9+
"""
10+
11+
from __future__ import unicode_literals
12+
13+
import os
14+
import sys
15+
16+
import base64
17+
import json
18+
import io
19+
import threading
20+
21+
import wx
22+
import gui
23+
from gui import guiHelper
24+
import globalVars
25+
import config
26+
from logHandler import log
27+
from keyboardHandler import KeyboardInputGesture
28+
import scriptHandler
29+
import ui
30+
import globalPluginHandler
31+
import api
32+
33+
from .captioner import ImageCaptioner
34+
from .modelManager import ModelManagerFrame
35+
from .panel import CaptionLocalSettingsPanel
36+
37+
try:
    import addonHandler

    addonHandler.initTranslation()
except Exception:
    # Best effort: translation setup is optional, so any failure here
    # (including addonHandler being unavailable) must not prevent this
    # module from loading. ``Exception`` is caught rather than using a bare
    # ``except`` so that KeyboardInterrupt and SystemExit still propagate.
    pass
43+
44+
# Module-level configuration.
# The default models directory is the "models" folder two levels above this package.
_here = os.path.dirname(__file__)
_modelsDir = os.path.abspath(os.path.join(_here, "..", "..", "models"))

# Specification of the "captionLocal" configuration section:
# - localModelPath: directory containing the model files (string).
# - loadModelWhenInit: whether to load the model as soon as the captioner is created (boolean).
CONFSPEC = dict(
    localModelPath=f"string(default={_modelsDir}/Xenova/vit-gpt2-image-captioning)",
    loadModelWhenInit="boolean(default=false)",
)

# Register the section so that config.conf["captionLocal"] can be read.
config.conf.spec["captionLocal"] = CONFSPEC
55+
56+
57+
def shootImage() -> bytes:
    """Capture the screen region occupied by the current navigator object.

    Returns:
        The captured region encoded as JPEG bytes.

    Raises:
        RuntimeError: If the navigator object reports no location, reports an
            empty area, or the captured image cannot be encoded as JPEG.
    """
    obj = api.getNavigatorObject()

    # Fail with a clear message instead of an opaque unpacking TypeError
    # when the object reports no location.
    location = obj.location
    if location is None:
        raise RuntimeError("Navigator object has no on-screen location")
    x, y, width, height = location
    if width <= 0 or height <= 0:
        raise RuntimeError("Navigator object has an empty on-screen area")

    # Copy the object's screen rectangle into an off-screen bitmap.
    bmp = wx.Bitmap(width, height)
    mem = wx.MemoryDC(bmp)
    try:
        mem.Blit(0, 0, width, height, wx.ScreenDC(), x, y)
    finally:
        # Select the bitmap out of the memory DC before reading it back;
        # a bitmap should not be used while it is still selected into a DC.
        mem.SelectObject(wx.NullBitmap)

    # Encode the captured bitmap as JPEG into an in-memory buffer.
    image = bmp.ConvertToImage()
    body = io.BytesIO()
    if not image.SaveFile(body, wx.BITMAP_TYPE_JPEG):
        raise RuntimeError("Failed to encode the captured region as JPEG")
    return body.getvalue()
90+
91+
92+
def caption(captioner: ImageCaptioner, imageData: bytes) -> None:
    """Generate a caption for an image, announce it and copy it to the clipboard.

    Any failure is reported to the user through ``ui.message`` and logged
    with its traceback; no exception propagates to the caller.

    Args:
        captioner: The captioner instance to use for generation.
        imageData: The image data to caption.
    """
    try:
        description = captioner.generate_caption(image=imageData)
        ui.message(description)
        # Copy silently: the caption itself has already been announced.
        api.copyToClip(text=description, notify=False)
    except Exception as e:
        ui.message(str(e))
        # Keep the traceback in the log so failures can be diagnosed.
        log.error(f"Image captioning failed: {e}", exc_info=True)
106+
107+
108+
def disableInSecureMode(decoratedCls):
    """Class decorator that swaps the class out when running in secure mode.

    Args:
        decoratedCls: The class being decorated.

    Returns:
        ``globalPluginHandler.GlobalPlugin`` when ``globalVars.appArgs.secure``
        is set, otherwise ``decoratedCls`` unchanged.
    """
    secureMode = globalVars.appArgs.secure
    return globalPluginHandler.GlobalPlugin if secureMode else decoratedCls
112+
113+
114+
@disableInSecureMode
class LocalCaptioner:
    """Image captioning using local ONNX models.

    Captures the screen region of the navigator object, generates a caption
    for it with an ``ImageCaptioner`` and manages the lifetime of the loaded
    model and of the model manager window.
    """

    def __init__(self) -> None:
        """Set up initial state and register the settings panel.

        When the ``loadModelWhenInit`` option is enabled, the model is loaded
        on a daemon thread straight away (this may cause high memory usage).
        """
        self.isModelLoaded = False
        self.captioner: ImageCaptioner | None = None
        self.managerFrame: ModelManagerFrame | None = None

        if config.conf["captionLocal"]["loadModelWhenInit"]:
            threading.Thread(target=self._loadModel, daemon=True).start()

        gui.settingsDialogs.NVDASettingsDialog.categoryClasses.append(CaptionLocalSettingsPanel)

    def terminate(self) -> None:
        """Unregister the settings panel added by ``__init__``."""
        try:
            gui.settingsDialogs.NVDASettingsDialog.categoryClasses.remove(CaptionLocalSettingsPanel)
        except (ValueError, AttributeError):
            # The panel was never registered or has already been removed.
            pass

    def runCaption(self, gesture) -> None:
        """Caption the current navigator object.

        The screenshot is taken and, if needed, the model is loaded on the
        calling thread; caption generation then runs on a separate thread.

        Args:
            gesture: The input gesture that triggered this script.
        """
        imageData = shootImage()

        if not self.isModelLoaded:
            # Translators: Message when loading the model
            ui.message(_("loading model..."))
            self._loadModel()

        imageThread = threading.Thread(target=caption, args=(self.captioner, imageData))
        # Translators: Message when starting image recognition
        ui.message(_("starting recognize"))
        imageThread.start()

    def _loadModel(self) -> None:
        """Create the ``ImageCaptioner`` from the configured model directory.

        Sets ``isModelLoaded`` to reflect the outcome.

        Raises:
            Exception: Re-raised after being announced if the model cannot be loaded.
        """
        try:
            localModelDirPath = config.conf["captionLocal"]["localModelPath"]
            encoderPath = f"{localModelDirPath}/onnx/encoder_model_quantized.onnx"
            decoderPath = f"{localModelDirPath}/onnx/decoder_model_merged_quantized.onnx"
            configPath = f"{localModelDirPath}/config.json"

            self.captioner = ImageCaptioner(
                encoder_path=encoderPath,
                decoder_path=decoderPath,
                config_path=configPath,
            )
            self.isModelLoaded = True
        except Exception as e:
            self.isModelLoaded = False
            ui.message(str(e))
            raise

    def releaseModel(self, gesture) -> None:
        """Drop the reference to the loaded model so its memory can be reclaimed.

        Args:
            gesture: The input gesture that triggered this script.
        """
        # Translators: Message when releasing the model
        ui.message(_("releasing model..."))
        try:
            # ``captioner`` is always set by __init__, so no hasattr check is needed.
            if self.captioner:
                self.captioner = None
                # Translators: Message when model is successfully released
                ui.message(_("model released and memory freed"))
            self.isModelLoaded = False
        except Exception as e:
            ui.message(str(e))
            raise

    def openManager(self, gesture) -> None:
        """Open the model manager window.

        Args:
            gesture: The input gesture that triggered this script.
        """
        # Translators: Message when opening model manager
        ui.message(_("opening model manager..."))
        try:
            self._openModelManager()
        except Exception as e:
            ui.message(str(e))
            raise

    def _openModelManager(self) -> None:
        """Schedule creation/showing of the model manager frame via ``wx.CallAfter``."""

        def showManager() -> None:
            """Create the manager frame if needed, then show and raise it."""
            try:
                # NOTE(review): wx.CallAfter needs an existing wx.App to
                # dispatch this callback, so no app is created here - confirm.
                if not self.managerFrame:
                    self.managerFrame = ModelManagerFrame()

                self.managerFrame.Show()
                self.managerFrame.Raise()
            except Exception as e:
                # Errors raised inside the deferred callback cannot reach
                # openManager, so record them here as well as announcing them.
                log.error(f"Failed to open model manager: {e}", exc_info=True)
                ui.message(str(e))

        # Ensure execution in main thread
        wx.CallAfter(showManager)
241+
242+
243+
def getLocalCaptionerConfig():
    """Return the configuration section used by the local captioner.

    The section is registered and read elsewhere in this module under the
    name "captionLocal", so that same key is used here.
    """
    return config.conf["captionLocal"]
245+
246+
247+
# Module-wide captioner instance managed by initialize() and terminate().
# Defined up front so that terminate() can safely run before initialize().
_localCaptioner: "LocalCaptioner | None" = None


def initialize():
    """Initialise the local captioner."""
    global _localCaptioner
    log.debug("Initializing local captioner")
    _localCaptioner = LocalCaptioner()


def terminate():
    """Terminate the local captioner.

    Does nothing (apart from a debug log entry) when the captioner is not running.
    """
    global _localCaptioner
    if _localCaptioner is None:
        log.debug("local captioner not running.")
        return
    log.debug("Terminating local captioner")
    _localCaptioner.terminate()
    _localCaptioner = None

0 commit comments

Comments
 (0)