|
21 | 21 | Optional, |
22 | 22 | Dict, |
23 | 23 | Tuple, |
| 24 | + Self, |
24 | 25 | ) |
25 | 26 |
|
26 | 27 | import baseObject |
@@ -402,11 +403,11 @@ def _get_boundingRects(self): |
402 | 403 |
|
403 | 404 | def unitIndex(self,unit): |
404 | 405 | """ |
405 | | -@param unit: a unit constant for which you want to retreave an index |
406 | | -@type: string |
407 | | -@returns: The 1-based index of this unit, out of all the units of this type in the object |
408 | | -@rtype: int |
409 | | -""" |
| 406 | + @param unit: a unit constant for which you want to retrieve an index |
| 407 | + @type: string |
| 408 | + @returns: The 1-based index of this unit, out of all the units of this type in the object |
| 409 | + @rtype: int |
| 410 | + """ |
410 | 411 | raise NotImplementedError |
411 | 412 |
|
412 | 413 | def unitCount(self,unit): |
@@ -656,6 +657,183 @@ def getMathMl(self, field): |
656 | 657 | @raise LookupError: If MathML can't be retrieved for this field. |
657 | 658 | """ |
658 | 659 | raise NotImplementedError |
| 660 | + |
| 661 | + def moveToPythonicOffset( |
| 662 | + self, |
| 663 | + pythonicOffset: int, |
| 664 | + ) -> Self: |
| 665 | + """ |
| 666 | + This function moves textInfos by Pythonic characters. |
| 667 | +
|
| 668 | + Illustration: |
| 669 | + Suppose we have TextInfo that represents a paragraph of text: |
| 670 | + ``` |
| 671 | + > s = paragraphInfo.text |
| 672 | + > s |
| 673 | + 'Hello, world!\r' |
| 674 | + ``` |
| 675 | + Suppose that we would like to put the cursor at the first letter of the word 'world'. |
| 676 | + That means jumping to index 7: |
| 677 | + ``` |
| 678 | + > s[7:] |
| 679 | + 'world!\r' |
| 680 | + ``` |
| 681 | + Here is how this can be done: |
| 682 | + ``` |
| 683 | + > info = paragraphInfo.moveToPythonicOffset(7) |
| 684 | + > info.setEndPoint(paragraphInfo, "endToEnd") |
| 685 | + > info.text |
| 686 | + 'world!\r' |
| 687 | + ``` |
| 688 | +
|
| 689 | + Background: |
| 690 | + In many applications there is no one-to-one mapping of Pythonic characters and TextInfo characters, |
| 691 | + e.g. when calling TextInfo.move(UNIT_CHARACTER, n). |
| 692 | + There are a couple of reasons for this discrepancy: |
| 693 | + 1. In Wide character encoding, some 4-byte unicode characters are represented as two surrogate characters, |
| 694 | + whereas in pythonic string they would be represented by a single character. |
| 695 | + 2. In non-offset TextInfos (e.g. UIATextInfo) |
| 696 | + there is no guarantee on the fact that TextInfos.move(UNIT_CHARACTER, 1) would actually move by |
| 697 | + exactly 1 character. |
| 698 | + A good illustration of this is in Microsoft Word with UIA enabled always, |
| 699 | + the first character of a bullet list item would be represented by three pythonic characters: |
| 700 | + * Bullet character "•" |
| 701 | + * Tab character \t |
| 702 | + * And the first character of the list item per se. |
| 703 | +
|
| 704 | + In many use cases (e.g., sentence navigation, style navigation), |
| 705 | + we identify pythonic character that we would like to move our TextInfo to. |
| 706 | + TextInfos.move(UNIT_CHARACTER, n) would cause many side effects. |
| 707 | + This function provides a clean and reliable way to jump to a given pythonic offset. |
| 708 | +
|
| 709 | + Assumptions: |
| 710 | + 1. This function operates on a non-collapsed TextInfo only. In a typical scenario, we might want |
| 711 | + to jump to a certain offset within a paragraph or a line. In this case this function |
| 712 | + should be called on TextInfo representing said paragraph or line. |
| 713 | + The reason for that is that for some implementations we might |
| 714 | + need to access text of paragraph/line in order to accurately compute result offset. |
| 715 | + 2. It assumes that 1 character of application-specific TextInfo representation |
| 716 | + maps to 1 or more characters of pythonic representation. |
| 717 | + 3. This function is also written with an assumption that a character |
| 718 | + in application-specific TextInfo representation might not map to any pythonic characters, |
| 719 | + although this scenario has never been observed in any applications. |
| 720 | + 4. Also this function assumes that most characters have 1:1 mapping between pythonic |
| 721 | + and application-specific representations. |
| 722 | + This assumption is not required, however if this assumption is True, the function will converge fast. |
| 723 | + If this assumption is false, then it might take many iterations to find the right TextInfo. |
| 724 | +
|
| 725 | + Algorithm: |
| 726 | + This generic implementation is essentially a biased binary search. |
| 727 | + On every iteration we operate on a pythonic string and its TextInfo counterpart stored in info variable. |
| 728 | + We would like to reach a certain offset within that pythonic string, |
| 729 | + that is stored in pythonicOffsetLeft variable. |
| 730 | + In every iteration of the loop: |
| 731 | + 1. We try to either move from the left end of info by pythonicOffsetLeft characters |
| 732 | + or from the right end by -pythonicOffsetRight characters - depending on which move is shorter. |
| 733 | + We store destination point as collapsed TextInfo tmpInfo. |
| 734 | + 2. We compute number of pythonic characters from the beginning of info until tmpInfo |
| 735 | + and store it in actualPythonicOffset variable. |
| 736 | + 3. We will compare actualPythonicOffset with pythonicOffsetLeft : if they are equal, |
| 737 | + then we just found desired TextInfo. |
| 738 | + Otherwise we use tmpInfo as the middle point of binary search and we recurse either to the left |
| 739 | + or to the right, depending where desired offset lies. |
| 740 | +
|
| 741 | + One extra part of the algorithm serves to prevent certain conditions: |
| 742 | + if we happen to move on the step 1 from the same point twice |
| 743 | + in two consecutive iterations of the loop, then on the second time we will move tmpInfo |
| 744 | + exactly to the opposite end of info, |
| 745 | + and the algorithm will fail on sanity check condition in the for loop. |
| 746 | + To avoid this situation we track last move and the direction of last divide |
| 747 | + in variables lastMove and lastRecursedLeft. |
| 748 | + If we detect that we are about to move from the same endpoint for the second time, |
| 749 | + we reduce the count of characters by 1 to make sure |
| 750 | + the algorithm makes some progress on each iteration. |
| 751 | + """ |
| 752 | + text = self.text |
| 753 | + if pythonicOffset < 0 or pythonicOffset > len(text): |
| 754 | + raise ValueError |
| 755 | + if pythonicOffset == 0 or pythonicOffset == len(text): |
| 756 | + result = self.copy() |
| 757 | + result.collapse(end=pythonicOffset > 0) |
| 758 | + return result |
| 759 | + |
| 760 | + info = self.copy() |
| 761 | + # Total Pythonic Length represents length in python characters of Current TextInfo we're working with. |
| 762 | + # We start with self, and then gradually divide and conquer in order to find desired offset. |
| 763 | + totalPythonicOffset = len(text) |
| 764 | + |
| 765 | + # pythonicOffsetLeft and pythonicOffsetRight represent distance in pythonic characters |
| 766 | + # from left and right ends of info correspondingly until desired location. |
| 767 | + pythonicOffsetLeft = pythonicOffset |
| 768 | + pythonicOffsetRight = totalPythonicOffset - pythonicOffsetLeft |
| 769 | + |
| 770 | + # We store lastMove - by how many characters we moved last time, and |
| 771 | + # lastRecursedLeft - whether last recursion happened to the left and not to the right - |
| 772 | + # in order to avoid certain corner cases. |
| 773 | + lastMove: int | None = None |
| 774 | + lastRecursedLeft: bool | None = None |
| 775 | + |
| 776 | + MAX_BINARY_SEARCH_ITERATIONS = 1000 |
| 777 | + for __ in range(MAX_BINARY_SEARCH_ITERATIONS): |
| 778 | + tmpInfo = info.copy() |
| 779 | + if pythonicOffsetLeft <= pythonicOffsetRight: |
| 780 | + # Move from the left end of info. Let's compute by how many characters in moveCharacters variable. |
| 781 | + tmpInfo.collapse() |
| 782 | + if lastRecursedLeft is not None and lastRecursedLeft is True and lastMove > 0: |
| 783 | + # Here we check that last time we also attempted to move from the same left end. |
| 784 | + # And apparently we overshot last time. In order to avoid infinite loop |
| 785 | + # or overshooting again, reduce movement by 1. |
| 786 | + moveCharacters = lastMove - 1 |
| 787 | + if moveCharacters == 0: |
| 788 | + raise RuntimeError("Unable to find desired offset in TextInfo.") |
| 789 | + else: |
| 790 | + moveCharacters = pythonicOffsetLeft |
| 791 | + |
| 792 | + code = tmpInfo.move(UNIT_CHARACTER, moveCharacters, endPoint="end") |
| 793 | + lastMove = moveCharacters |
| 794 | + tmpText = tmpInfo.text |
| 795 | + actualPythonicOffset = len(tmpText) |
| 796 | + tmpInfo.collapse(end=True) |
| 797 | + else: |
| 798 | + # Move from the right end of info. |
| 799 | + tmpInfo.collapse(end=True) |
| 800 | + if lastRecursedLeft is not None and lastRecursedLeft is False and lastMove < 0: |
| 801 | + # lastMove was negative, so adding +1 to reduce its absolute value |
| 802 | + moveCharacters = lastMove + 1 |
| 803 | + if moveCharacters == 0: |
| 804 | + raise RuntimeError("Unable to find desired offset in TextInfo.") |
| 805 | + else: |
| 806 | + moveCharacters = -pythonicOffsetRight |
| 807 | + code = tmpInfo.move(UNIT_CHARACTER, moveCharacters, endPoint="start") |
| 808 | + lastMove = moveCharacters |
| 809 | + tmpText = tmpInfo.text |
| 810 | + actualPythonicOffset = totalPythonicOffset - len(tmpText) |
| 811 | + tmpInfo.collapse() |
| 812 | + if code == 0: |
| 813 | + raise RuntimeError("Move by character operation unexpectedly failed.") |
| 814 | + if actualPythonicOffset <= 0 or actualPythonicOffset >= totalPythonicOffset: |
| 815 | + raise RuntimeError(f"InvalidState: {actualPythonicOffset=} {totalPythonicOffset=}") |
| 816 | + if actualPythonicOffset == pythonicOffsetLeft: |
| 817 | + return tmpInfo |
| 818 | + elif actualPythonicOffset < pythonicOffsetLeft: |
| 819 | + # Recursing right |
| 820 | + lastRecursedLeft = False |
| 821 | + text = text[actualPythonicOffset:] |
| 822 | + pythonicOffsetLeft -= actualPythonicOffset |
| 823 | + totalPythonicOffset = pythonicOffsetLeft + pythonicOffsetRight |
| 824 | + info.setEndPoint(tmpInfo, which="startToStart") |
| 825 | + else: # actualPythonicOffset > pythonicOffsetLeft |
| 826 | + # Recursing left |
| 827 | + lastRecursedLeft = True |
| 828 | + text = text[:actualPythonicOffset] |
| 829 | + totalPythonicOffset = actualPythonicOffset |
| 830 | + pythonicOffsetRight = totalPythonicOffset - pythonicOffsetLeft |
| 831 | + info.setEndPoint(tmpInfo, which="endToEnd") |
| 832 | + raise RuntimeError("Infinite loop during binary search.") |
| 833 | + |
| 834 | + |
| 835 | + |
| 836 | + |
659 | 837 |
|
660 | 838 | RE_EOL = re.compile("\r\n|[\n\r]") |
661 | 839 | def convertToCrlf(text): |
|
0 commit comments