Skip to content

Commit 99389e1

Browse files
Merge 9259e3e into 2325979
2 parents 2325979 + 9259e3e commit 99389e1

8 files changed

Lines changed: 1902 additions & 2 deletions

File tree

pyproject.toml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ dependencies = [
4949
"mdx_truly_sane_lists==1.3",
5050
"markdown-link-attr-modifier==0.2.1",
5151
"mdx-gh-links==0.4",
52+
# local image caption
53+
"onnxruntime == 1.19.2",
5254
]
5355

5456
[project.urls]
@@ -322,11 +324,11 @@ lint = [
322324
"pyright==1.1.401",
323325
]
324326
license-check = [
325-
"licensecheck==2024.3",
327+
"licensecheck==2025.1",
326328
]
327329
dev-docs = [
328330
# For building developer documentation
329-
"sphinx==8.1.2",
331+
"sphinx==8.1.3",
330332
"sphinx_rtd_theme==3.0.1",
331333
]
332334
system-tests = [

source/_localCaptioner/__init__.py

Lines changed: 271 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,271 @@
1+
# -*- coding: UTF-8 -*-
2+
# A part of NonVisual Desktop Access (NVDA)
3+
# Copyright (C) 2025 NV Access Limited, tianze
4+
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
5+
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
6+
""" Localcaptioner module for NVDA.
7+
8+
This module provides local image captioning functionality using ONNX models.
9+
It allows users to capture screen regions and generate captions using local AI models.
10+
"""
11+
12+
from __future__ import unicode_literals
13+
14+
import os
15+
import sys
16+
from typing import Optional
17+
import base64
18+
import json
19+
import io
20+
import threading
21+
22+
import wx
23+
import gui
24+
from gui import guiHelper
25+
import globalVars
26+
import config
27+
from logHandler import log
28+
from keyboardHandler import KeyboardInputGesture
29+
import scriptHandler
30+
import ui
31+
import globalPluginHandler
32+
import api
33+
34+
from .captioner import ImageCaptioner
35+
from .modelManager import ModelManagerFrame
36+
from .panel import CaptionLocalSettingsPanel
37+
38+
# Best-effort translation setup: failure to import addonHandler or to
# initialise translations is deliberately ignored so the module still loads.
# Only ordinary errors are swallowed; KeyboardInterrupt/SystemExit propagate.
try:
    import addonHandler
    addonHandler.initTranslation()
except Exception:
    pass
43+
44+
# Module-level configuration
45+
_here = os.path.dirname(__file__)
46+
_modelsDir = os.path.join(_here, "..", "..", "models")
47+
_modelsDir = os.path.abspath(_modelsDir)
48+
49+
CONFSPEC = {
50+
"localModelPath": f"string(default={_modelsDir}/Xenova/vit-gpt2-image-captioning)",
51+
"loadModelWhenInit": "boolean(default=false)"
52+
}
53+
54+
config.conf.spec['captionLocal'] = CONFSPEC
55+
56+
57+
def shootImage() -> bytes:
    """Capture a screenshot of the current navigator object.

    Returns:
        The captured image data as bytes in JPEG format.

    Raises:
        RuntimeError: If the navigator object reports no location.
    """
    # Use the navigator object (not necessarily the focused object).
    obj = api.getNavigatorObject()

    # Position and size of the object on screen.
    location = obj.location
    if location is None:
        raise RuntimeError("Navigator object has no screen location to capture")
    x, y, width, height = location

    # Bitmap of the same size as the object, drawn into via a memory DC.
    bmp = wx.Bitmap(width, height)
    mem = wx.MemoryDC(bmp)
    try:
        # Copy the object's screen region into the bitmap.
        mem.Blit(0, 0, width, height, wx.ScreenDC(), x, y)
    finally:
        # The bitmap must be selected out of the memory DC before it is
        # used for anything else (here: conversion to a wx.Image).
        mem.SelectObject(wx.NullBitmap)

    image = bmp.ConvertToImage()

    # Encode the image as JPEG into an in-memory buffer and return its bytes.
    body = io.BytesIO()
    image.SaveFile(body, wx.BITMAP_TYPE_JPEG)
    return body.getvalue()
90+
91+
92+
def caption(captioner: ImageCaptioner, imageData: bytes) -> None:
    """Generate a caption for the given image data.

    The caption is reported with ``ui.message`` and copied to the clipboard
    without a notification. Any exception is reported to the user and logged;
    it is not re-raised.

    Args:
        captioner: The captioner instance to use for generation.
        imageData: The image data to caption.
    """
    try:
        description = captioner.generate_caption(image=imageData)
        ui.message(description)
        api.copyToClip(text=description, notify=False)
    except Exception as e:
        ui.message(str(e))
        # Include the traceback so failures can be diagnosed from the log.
        log.error(e, exc_info=True)
106+
107+
def disableInSecureMode(decoratedCls):
    """Class decorator that substitutes the base plugin class in secure mode.

    Args:
        decoratedCls: The class being decorated.

    Returns:
        ``globalPluginHandler.GlobalPlugin`` when ``globalVars.appArgs.secure``
        is truthy; otherwise ``decoratedCls`` unchanged.
    """
    return globalPluginHandler.GlobalPlugin if globalVars.appArgs.secure else decoratedCls
111+
112+
113+
@disableInSecureMode
class LocalCaptioner:
    """Local image captioning using ONNX models.

    Captures the navigator object's screen region and generates a
    descriptive caption with a locally stored model. It also registers the
    captioner settings panel and can open the model manager window.
    """

    def __init__(self) -> None:
        """Set up initial state and register the settings panel.

        When the ``loadModelWhenInit`` option is enabled, the model is loaded
        on a background daemon thread.
        """
        self.isModelLoaded = False
        self.captioner: Optional[ImageCaptioner] = None
        self.managerFrame: Optional[ModelManagerFrame] = None

        loadModelWhenInit = config.conf['captionLocal']['loadModelWhenInit']
        # Load model when initializing (may cause high memory usage)
        if loadModelWhenInit:
            threading.Thread(target=self._loadModel, daemon=True).start()

        gui.settingsDialogs.NVDASettingsDialog.categoryClasses.append(CaptionLocalSettingsPanel)

    def terminate(self) -> None:
        """Clean up resources when the captioner is terminated."""
        try:
            gui.settingsDialogs.NVDASettingsDialog.categoryClasses.remove(CaptionLocalSettingsPanel)
        except (ValueError, AttributeError):
            # Panel was never registered or has already been removed.
            pass

    def runCaption(self, gesture) -> None:
        """Run image captioning on the current navigator object.

        The screenshot is taken first; the model is then loaded synchronously
        if it is not loaded yet, and caption generation runs on a background
        thread.

        Args:
            gesture: The input gesture that triggered this script.

        Raises:
            Exception: Propagated from ``_loadModel`` if the model cannot be loaded.
        """
        imageData = shootImage()

        if not self.isModelLoaded:
            # Translators: Message when loading the model
            ui.message(_("loading model..."))
            self._loadModel()

        # Daemon thread, consistent with the model-loading thread, so that a
        # running caption job does not keep the process alive on exit.
        imageThread = threading.Thread(
            target=caption,
            args=(self.captioner, imageData),
            daemon=True,
        )
        # Translators: Message when starting image recognition
        ui.message(_("starting recognize"))
        imageThread.start()

    def _loadModel(self) -> None:
        """Load the ONNX model for image captioning.

        Reads the model directory from the ``captionLocal`` configuration and
        builds the encoder, decoder and config paths beneath it.

        Raises:
            Exception: If the model cannot be loaded. The error text is
                reported to the user before the exception is re-raised.
        """
        try:
            localModelDirPath = config.conf['captionLocal']['localModelPath']
            encoderPath = f"{localModelDirPath}/onnx/encoder_model_quantized.onnx"
            decoderPath = f"{localModelDirPath}/onnx/decoder_model_merged_quantized.onnx"
            configPath = f"{localModelDirPath}/config.json"

            self.captioner = ImageCaptioner(
                encoder_path=encoderPath,
                decoder_path=decoderPath,
                config_path=configPath,
            )
            self.isModelLoaded = True
        except Exception as e:
            self.isModelLoaded = False
            ui.message(str(e))
            raise

    def releaseModel(self, gesture) -> None:
        """Release the loaded model from memory.

        Args:
            gesture: The input gesture that triggered this script.
        """
        # Translators: Message when releasing the model
        ui.message(_("releasing model..."))
        try:
            if self.captioner:
                # Dropping the only reference held here lets the model be
                # garbage collected.
                self.captioner = None
                # Translators: Message when model is successfully released
                ui.message(_("model released and memory freed"))
            self.isModelLoaded = False
        except Exception as e:
            ui.message(str(e))
            raise

    def openManager(self, gesture) -> None:
        """Open the model manager window.

        Args:
            gesture: The input gesture that triggered this script.
        """
        # Translators: Message when opening model manager
        ui.message(_("opening model manager..."))
        try:
            self._openModelManager()
        except Exception as e:
            ui.message(str(e))
            raise

    def _openModelManager(self) -> None:
        """Schedule showing of the model manager frame via ``wx.CallAfter``."""

        def showManager() -> None:
            """Create the manager frame if needed, then show and raise it."""
            try:
                # Use existing wx.App if available
                app = wx.GetApp()
                if app is None:
                    app = wx.App()

                # Create the frame when there is none yet or the stored one
                # evaluates as false.
                if not self.managerFrame:
                    self.managerFrame = ModelManagerFrame()

                self.managerFrame.Show()
                self.managerFrame.Raise()

            except Exception as e:
                ui.message(str(e))

        # Ensure execution in main thread
        wx.CallAfter(showManager)
242+
243+
244+
245+
246+
247+
def getLocalCaptionerConfig():
    """Return the configuration section used by the local captioner.

    The section name matches the one under which this module registers its
    configuration spec ("captionLocal").
    """
    return config.conf["captionLocal"]
249+
250+
251+
252+
253+
# Module-wide captioner instance: set by initialize(), cleared by terminate().
# Declared here so that terminate() is safe to call before initialize().
_localCaptioner: Optional["LocalCaptioner"] = None


def initialize():
    """Initialise the local captioner."""
    global _localCaptioner
    log.debug("Initializing local captioner")
    _localCaptioner = LocalCaptioner()


def terminate():
    """Terminate the local captioner.

    Does nothing (apart from a debug log entry) when the captioner is not running.
    """
    global _localCaptioner
    if _localCaptioner is None:
        log.debug("local captioner not running.")
        return
    log.debug("Terminating local captioner")
    _localCaptioner.terminate()
    _localCaptioner = None
270+
271+

0 commit comments

Comments
 (0)