Skip to content
This repository was archived by the owner on Nov 23, 2024. It is now read-only.

Boundary annotation wrongly created: The description only described an if-case for the boundary, not a requirement. #35

@Masara

Description

@Masara

URL Hash

#/sklearn/sklearn.feature_extraction.text/TfidfVectorizer/__init__/min_df

Actual Annotation Type

@boundary

Actual Annotation Inputs

{
    "target": "sklearn/sklearn.feature_extraction.text/TfidfVectorizer/__init__/min_df",
    "authors": [
        "$autogen$"
    ],
    "interval": {
        "isDiscrete": false,
        "lowerIntervalLimit": 0,
        "lowerLimitType": 0,
        "upperIntervalLimit": 1,
        "upperLimitType": 0
    }
}

Expected Annotation Type

No annotation

Expected Annotation Inputs

Minimal API Data (optional)

Minimal API Data for `sklearn/sklearn.feature_extraction.text/TfidfVectorizer/__init__/min_df`
{
    "schemaVersion": 1,
    "distribution": "scikit-learn",
    "package": "sklearn",
    "version": "1.1.1",
    "modules": [
        {
            "id": "sklearn/sklearn.feature_extraction.text",
            "name": "sklearn.feature_extraction.text",
            "imports": [
                {
                    "module": "array",
                    "alias": null
                },
                {
                    "module": "numbers",
                    "alias": null
                },
                {
                    "module": "numpy",
                    "alias": "np"
                },
                {
                    "module": "re",
                    "alias": null
                },
                {
                    "module": "scipy.sparse",
                    "alias": "sp"
                },
                {
                    "module": "unicodedata",
                    "alias": null
                },
                {
                    "module": "warnings",
                    "alias": null
                }
            ],
            "from_imports": [
                {
                    "module": "collections",
                    "declaration": "defaultdict",
                    "alias": null
                },
                {
                    "module": "collections.abc",
                    "declaration": "Mapping",
                    "alias": null
                },
                {
                    "module": "functools",
                    "declaration": "partial",
                    "alias": null
                },
                {
                    "module": "operator",
                    "declaration": "itemgetter",
                    "alias": null
                },
                {
                    "module": "sklearn.base",
                    "declaration": "_OneToOneFeatureMixin",
                    "alias": null
                },
                {
                    "module": "sklearn.base",
                    "declaration": "BaseEstimator",
                    "alias": null
                },
                {
                    "module": "sklearn.base",
                    "declaration": "TransformerMixin",
                    "alias": null
                },
                {
                    "module": "sklearn.exceptions",
                    "declaration": "NotFittedError",
                    "alias": null
                },
                {
                    "module": "sklearn.feature_extraction._hash",
                    "declaration": "FeatureHasher",
                    "alias": null
                },
                {
                    "module": "sklearn.feature_extraction._stop_words",
                    "declaration": "ENGLISH_STOP_WORDS",
                    "alias": null
                },
                {
                    "module": "sklearn.preprocessing",
                    "declaration": "normalize",
                    "alias": null
                },
                {
                    "module": "sklearn.utils",
                    "declaration": "_IS_32BIT",
                    "alias": null
                },
                {
                    "module": "sklearn.utils.deprecation",
                    "declaration": "deprecated",
                    "alias": null
                },
                {
                    "module": "sklearn.utils.validation",
                    "declaration": "check_array",
                    "alias": null
                },
                {
                    "module": "sklearn.utils.validation",
                    "declaration": "check_is_fitted",
                    "alias": null
                },
                {
                    "module": "sklearn.utils.validation",
                    "declaration": "check_scalar",
                    "alias": null
                },
                {
                    "module": "sklearn.utils.validation",
                    "declaration": "FLOAT_DTYPES",
                    "alias": null
                }
            ],
            "classes": [
                "sklearn/sklearn.feature_extraction.text/TfidfVectorizer"
            ],
            "functions": []
        }
    ],
    "classes": [
        {
            "id": "sklearn/sklearn.feature_extraction.text/TfidfVectorizer",
            "name": "TfidfVectorizer",
            "qname": "sklearn.feature_extraction.text.TfidfVectorizer",
            "decorators": [],
            "superclasses": [
                "CountVectorizer"
            ],
            "methods": [
                "sklearn/sklearn.feature_extraction.text/TfidfVectorizer/__init__"
            ],
            "is_public": true,
            "reexported_by": [],
            "description": "Convert a collection of raw documents to a matrix of TF-IDF features.\n\nEquivalent to :class:`CountVectorizer` followed by\n:class:`TfidfTransformer`.\n\nRead more in the :ref:`User Guide <text_feature_extraction>`.",
            "docstring": "Convert a collection of raw documents to a matrix of TF-IDF features.\n\nEquivalent to :class:`CountVectorizer` followed by\n:class:`TfidfTransformer`.\n\nRead more in the :ref:`User Guide <text_feature_extraction>`.\n\nParameters\n----------\ninput : {'filename', 'file', 'content'}, default='content'\n    - If `'filename'`, the sequence passed as an argument to fit is\n      expected to be a list of filenames that need reading to fetch\n      the raw content to analyze.\n\n    - If `'file'`, the sequence items must have a 'read' method (file-like\n      object) that is called to fetch the bytes in memory.\n\n    - If `'content'`, the input is expected to be a sequence of items that\n      can be of type string or byte.\n\nencoding : str, default='utf-8'\n    If bytes or files are given to analyze, this encoding is used to\n    decode.\n\ndecode_error : {'strict', 'ignore', 'replace'}, default='strict'\n    Instruction on what to do if a byte sequence is given to analyze that\n    contains characters not of the given `encoding`. By default, it is\n    'strict', meaning that a UnicodeDecodeError will be raised. 
Other\n    values are 'ignore' and 'replace'.\n\nstrip_accents : {'ascii', 'unicode'}, default=None\n    Remove accents and perform other character normalization\n    during the preprocessing step.\n    'ascii' is a fast method that only works on characters that have\n    an direct ASCII mapping.\n    'unicode' is a slightly slower method that works on any characters.\n    None (default) does nothing.\n\n    Both 'ascii' and 'unicode' use NFKD normalization from\n    :func:`unicodedata.normalize`.\n\nlowercase : bool, default=True\n    Convert all characters to lowercase before tokenizing.\n\npreprocessor : callable, default=None\n    Override the preprocessing (string transformation) stage while\n    preserving the tokenizing and n-grams generation steps.\n    Only applies if ``analyzer`` is not callable.\n\ntokenizer : callable, default=None\n    Override the string tokenization step while preserving the\n    preprocessing and n-grams generation steps.\n    Only applies if ``analyzer == 'word'``.\n\nanalyzer : {'word', 'char', 'char_wb'} or callable, default='word'\n    Whether the feature should be made of word or character n-grams.\n    Option 'char_wb' creates character n-grams only from text inside\n    word boundaries; n-grams at the edges of words are padded with space.\n\n    If a callable is passed it is used to extract the sequence of features\n    out of the raw, unprocessed input.\n\n    .. versionchanged:: 0.21\n        Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data\n        is first read from the file and then passed to the given callable\n        analyzer.\n\nstop_words : {'english'}, list, default=None\n    If a string, it is passed to _check_stop_list and the appropriate stop\n    list is returned. 
'english' is currently the only supported string\n    value.\n    There are several known issues with 'english' and you should\n    consider an alternative (see :ref:`stop_words`).\n\n    If a list, that list is assumed to contain stop words, all of which\n    will be removed from the resulting tokens.\n    Only applies if ``analyzer == 'word'``.\n\n    If None, no stop words will be used. max_df can be set to a value\n    in the range [0.7, 1.0) to automatically detect and filter stop\n    words based on intra corpus document frequency of terms.\n\ntoken_pattern : str, default=r\"(?u)\\\\b\\\\w\\\\w+\\\\b\"\n    Regular expression denoting what constitutes a \"token\", only used\n    if ``analyzer == 'word'``. The default regexp selects tokens of 2\n    or more alphanumeric characters (punctuation is completely ignored\n    and always treated as a token separator).\n\n    If there is a capturing group in token_pattern then the\n    captured group content, not the entire match, becomes the token.\n    At most one capturing group is permitted.\n\nngram_range : tuple (min_n, max_n), default=(1, 1)\n    The lower and upper boundary of the range of n-values for different\n    n-grams to be extracted. All values of n such that min_n <= n <= max_n\n    will be used. 
For example an ``ngram_range`` of ``(1, 1)`` means only\n    unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means\n    only bigrams.\n    Only applies if ``analyzer`` is not callable.\n\nmax_df : float or int, default=1.0\n    When building the vocabulary ignore terms that have a document\n    frequency strictly higher than the given threshold (corpus-specific\n    stop words).\n    If float in range [0.0, 1.0], the parameter represents a proportion of\n    documents, integer absolute counts.\n    This parameter is ignored if vocabulary is not None.\n\nmin_df : float or int, default=1\n    When building the vocabulary ignore terms that have a document\n    frequency strictly lower than the given threshold. This value is also\n    called cut-off in the literature.\n    If float in range of [0.0, 1.0], the parameter represents a proportion\n    of documents, integer absolute counts.\n    This parameter is ignored if vocabulary is not None.\n\nmax_features : int, default=None\n    If not None, build a vocabulary that only consider the top\n    max_features ordered by term frequency across the corpus.\n\n    This parameter is ignored if vocabulary is not None.\n\nvocabulary : Mapping or iterable, default=None\n    Either a Mapping (e.g., a dict) where keys are terms and values are\n    indices in the feature matrix, or an iterable over terms. If not\n    given, a vocabulary is determined from the input documents.\n\nbinary : bool, default=False\n    If True, all non-zero term counts are set to 1. This does not mean\n    outputs will have only 0/1 values, only that the tf term in tf-idf\n    is binary. (Set idf and normalization to False to get 0/1 outputs).\n\ndtype : dtype, default=float64\n    Type of the matrix returned by fit_transform() or transform().\n\nnorm : {'l1', 'l2'}, default='l2'\n    Each output row will have unit norm, either:\n\n    - 'l2': Sum of squares of vector elements is 1. 
The cosine\n      similarity between two vectors is their dot product when l2 norm has\n      been applied.\n    - 'l1': Sum of absolute values of vector elements is 1.\n      See :func:`preprocessing.normalize`.\n\nuse_idf : bool, default=True\n    Enable inverse-document-frequency reweighting. If False, idf(t) = 1.\n\nsmooth_idf : bool, default=True\n    Smooth idf weights by adding one to document frequencies, as if an\n    extra document was seen containing every term in the collection\n    exactly once. Prevents zero divisions.\n\nsublinear_tf : bool, default=False\n    Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).\n\nAttributes\n----------\nvocabulary_ : dict\n    A mapping of terms to feature indices.\n\nfixed_vocabulary_ : bool\n    True if a fixed vocabulary of term to indices mapping\n    is provided by the user.\n\nidf_ : array of shape (n_features,)\n    The inverse document frequency (IDF) vector; only defined\n    if ``use_idf`` is True.\n\nstop_words_ : set\n    Terms that were ignored because they either:\n\n      - occurred in too many documents (`max_df`)\n      - occurred in too few documents (`min_df`)\n      - were cut off by feature selection (`max_features`).\n\n    This is only available if no vocabulary was given.\n\nSee Also\n--------\nCountVectorizer : Transforms text into a sparse matrix of n-gram counts.\n\nTfidfTransformer : Performs the TF-IDF transformation from a provided\n    matrix of counts.\n\nNotes\n-----\nThe ``stop_words_`` attribute can get large and increase the model size\nwhen pickling. This attribute is provided only for introspection and can\nbe safely removed using delattr or set to None before pickling.\n\nExamples\n--------\n>>> from sklearn.feature_extraction.text import TfidfVectorizer\n>>> corpus = [\n...     'This is the first document.',\n...     'This document is the second document.',\n...     'And this is the third one.',\n...     'Is this the first document?',\n... 
]\n>>> vectorizer = TfidfVectorizer()\n>>> X = vectorizer.fit_transform(corpus)\n>>> vectorizer.get_feature_names_out()\narray(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',\n       'this'], ...)\n>>> print(X.shape)\n(4, 9)"
        }
    ],
    "functions": [
        {
            "id": "sklearn/sklearn.feature_extraction.text/TfidfVectorizer/__init__",
            "name": "__init__",
            "qname": "sklearn.feature_extraction.text.TfidfVectorizer.__init__",
            "decorators": [],
            "parameters": [
                {
                    "id": "sklearn/sklearn.feature_extraction.text/TfidfVectorizer/__init__/min_df",
                    "name": "min_df",
                    "qname": "sklearn.feature_extraction.text.TfidfVectorizer.__init__.min_df",
                    "default_value": "1",
                    "assigned_by": "NAME_ONLY",
                    "is_public": true,
                    "docstring": {
                        "type": "float or int",
                        "description": "When building the vocabulary ignore terms that have a document\nfrequency strictly lower than the given threshold. This value is also\ncalled cut-off in the literature.\nIf float in range of [0.0, 1.0], the parameter represents a proportion\nof documents, integer absolute counts.\nThis parameter is ignored if vocabulary is not None."
                    },
                    "type": {}
                }
            ],
            "results": [],
            "is_public": true,
            "reexported_by": [],
            "description": "Convert a collection of raw documents to a matrix of TF-IDF features.\n\nEquivalent to :class:`CountVectorizer` followed by\n:class:`TfidfTransformer`.\n\nRead more in the :ref:`User Guide <text_feature_extraction>`.",
            "docstring": ""
        }
    ]
}

Minimal Usage Store (optional)

Minimal Usage Store for `sklearn/sklearn.feature_extraction.text/TfidfVectorizer/__init__/min_df`
{
    "schemaVersion": 1,
    "module_counts": {
        "sklearn/sklearn.feature_extraction.text": 18626
    },
    "class_counts": {
        "sklearn/sklearn.feature_extraction.text/TfidfVectorizer": 8587
    },
    "function_counts": {
        "sklearn/sklearn.feature_extraction.text/TfidfVectorizer/__init__": 3212
    },
    "parameter_counts": {
        "sklearn/sklearn.feature_extraction.text/TfidfVectorizer/__init__/min_df": 1028
    },
    "value_counts": {
        "sklearn/sklearn.feature_extraction.text/TfidfVectorizer/__init__/min_df": {
            "0": 41,
            "1": 2242,
            "2": 172,
            "3": 323,
            "4": 7,
            "5": 214,
            "6": 1,
            "7": 2,
            "9": 1,
            "10": 77,
            "15": 4,
            "20": 9,
            "23": 1,
            "25": 1,
            "30": 3,
            "50": 6,
            "100": 12,
            "150": 9,
            "200": 6,
            "0.0": 13,
            "0.01": 12,
            "9e-05": 10,
            "min_df": 10,
            "0.001": 6,
            "0.2": 4,
            "self.min_df": 3,
            "np.int(min_df * texts.shape[0])": 2,
            "0.005": 2,
            "0.1": 2,
            "0.05": 2,
            "MIN_DOCUMENT_FREQUENCY": 2,
            "1e-07": 1,
            "0.009": 1,
            "noOfocurance": 1,
            "0.15": 1,
            "model_question.vocabulary.min_count": 1,
            "0.0005": 1,
            "0.0001": 1,
            "MIN_DF_TF": 1,
            "NAME_MIN_DF": 1,
            "0.0025": 1,
            "min_df_opt": 1,
            "self.tfidf_min_df": 1,
            "mindf": 1
        }
    }
}

Suggested Solution (optional)

No response

Additional Context (optional)

image

The description only says "If float... ", which isn't a requirement. The boundary annotation is therefore wrongly added.

Metadata

Metadata

Assignees

No one assigned

    Labels

    @boundary — Related to the @boundary annotation
    bug 🪲 — Something isn't working
    wrong annotation — An annotation was generated automatically but is incorrect

    Type

    No type

    Projects

    Status

    ✔️ Done

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions