This repository was archived by the owner on Nov 23, 2024. It is now read-only.
Boundary annotation wrongly created: The description only describes an if-case for the boundary, not a requirement. #35
Closed
Labels
@boundary (Related to the @boundary annotation) · bug 🪲 (Something isn't working) · wrong annotation (An annotation was generated automatically but is incorrect)
Description
URL Hash
#/sklearn/sklearn.feature_extraction.text/TfidfVectorizer/__init__/min_df
Actual Annotation Type
@boundary
Actual Annotation Inputs
{
"target": "sklearn/sklearn.feature_extraction.text/TfidfVectorizer/__init__/min_df",
"authors": [
"$autogen$"
],
"interval": {
"isDiscrete": false,
"lowerIntervalLimit": 0,
"lowerLimitType": 0,
"upperIntervalLimit": 1,
"upperLimitType": 0
}
}
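Read literally, the autogenerated interval above asserts a continuous range from 0 to 1 for `min_df`. Below is a minimal sketch of how such an interval check would behave; the field semantics are an assumption here (isDiscrete=false is read as a continuous range and limit type 0 as an inclusive bound), since the issue does not spell them out:

def violates_boundary(value, lower=0.0, upper=1.0):
    # Assumed reading of the interval above: isDiscrete=false means a
    # continuous range, and limitType 0 means an inclusive bound.
    return not (lower <= value <= upper)

# min_df=0.5 passes, but perfectly legal integer document counts
# (see the docstring below) would be flagged as violations:
for v in (0.5, 1, 2, 5, 100):
    print(v, "violates" if violates_boundary(v) else "ok")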
Expected Annotation Type
No annotation
Expected Annotation Inputs
Minimal API Data (optional)
Minimal API Data for `sklearn/sklearn.feature_extraction.text/TfidfVectorizer/__init__/min_df`
{
"schemaVersion": 1,
"distribution": "scikit-learn",
"package": "sklearn",
"version": "1.1.1",
"modules": [
{
"id": "sklearn/sklearn.feature_extraction.text",
"name": "sklearn.feature_extraction.text",
"imports": [
{
"module": "array",
"alias": null
},
{
"module": "numbers",
"alias": null
},
{
"module": "numpy",
"alias": "np"
},
{
"module": "re",
"alias": null
},
{
"module": "scipy.sparse",
"alias": "sp"
},
{
"module": "unicodedata",
"alias": null
},
{
"module": "warnings",
"alias": null
}
],
"from_imports": [
{
"module": "collections",
"declaration": "defaultdict",
"alias": null
},
{
"module": "collections.abc",
"declaration": "Mapping",
"alias": null
},
{
"module": "functools",
"declaration": "partial",
"alias": null
},
{
"module": "operator",
"declaration": "itemgetter",
"alias": null
},
{
"module": "sklearn.base",
"declaration": "_OneToOneFeatureMixin",
"alias": null
},
{
"module": "sklearn.base",
"declaration": "BaseEstimator",
"alias": null
},
{
"module": "sklearn.base",
"declaration": "TransformerMixin",
"alias": null
},
{
"module": "sklearn.exceptions",
"declaration": "NotFittedError",
"alias": null
},
{
"module": "sklearn.feature_extraction._hash",
"declaration": "FeatureHasher",
"alias": null
},
{
"module": "sklearn.feature_extraction._stop_words",
"declaration": "ENGLISH_STOP_WORDS",
"alias": null
},
{
"module": "sklearn.preprocessing",
"declaration": "normalize",
"alias": null
},
{
"module": "sklearn.utils",
"declaration": "_IS_32BIT",
"alias": null
},
{
"module": "sklearn.utils.deprecation",
"declaration": "deprecated",
"alias": null
},
{
"module": "sklearn.utils.validation",
"declaration": "check_array",
"alias": null
},
{
"module": "sklearn.utils.validation",
"declaration": "check_is_fitted",
"alias": null
},
{
"module": "sklearn.utils.validation",
"declaration": "check_scalar",
"alias": null
},
{
"module": "sklearn.utils.validation",
"declaration": "FLOAT_DTYPES",
"alias": null
}
],
"classes": [
"sklearn/sklearn.feature_extraction.text/TfidfVectorizer"
],
"functions": []
}
],
"classes": [
{
"id": "sklearn/sklearn.feature_extraction.text/TfidfVectorizer",
"name": "TfidfVectorizer",
"qname": "sklearn.feature_extraction.text.TfidfVectorizer",
"decorators": [],
"superclasses": [
"CountVectorizer"
],
"methods": [
"sklearn/sklearn.feature_extraction.text/TfidfVectorizer/__init__"
],
"is_public": true,
"reexported_by": [],
"description": "Convert a collection of raw documents to a matrix of TF-IDF features.\n\nEquivalent to :class:`CountVectorizer` followed by\n:class:`TfidfTransformer`.\n\nRead more in the :ref:`User Guide <text_feature_extraction>`.",
"docstring": "Convert a collection of raw documents to a matrix of TF-IDF features.\n\nEquivalent to :class:`CountVectorizer` followed by\n:class:`TfidfTransformer`.\n\nRead more in the :ref:`User Guide <text_feature_extraction>`.\n\nParameters\n----------\ninput : {'filename', 'file', 'content'}, default='content'\n - If `'filename'`, the sequence passed as an argument to fit is\n expected to be a list of filenames that need reading to fetch\n the raw content to analyze.\n\n - If `'file'`, the sequence items must have a 'read' method (file-like\n object) that is called to fetch the bytes in memory.\n\n - If `'content'`, the input is expected to be a sequence of items that\n can be of type string or byte.\n\nencoding : str, default='utf-8'\n If bytes or files are given to analyze, this encoding is used to\n decode.\n\ndecode_error : {'strict', 'ignore', 'replace'}, default='strict'\n Instruction on what to do if a byte sequence is given to analyze that\n contains characters not of the given `encoding`. By default, it is\n 'strict', meaning that a UnicodeDecodeError will be raised. Other\n values are 'ignore' and 'replace'.\n\nstrip_accents : {'ascii', 'unicode'}, default=None\n Remove accents and perform other character normalization\n during the preprocessing step.\n 'ascii' is a fast method that only works on characters that have\n an direct ASCII mapping.\n 'unicode' is a slightly slower method that works on any characters.\n None (default) does nothing.\n\n Both 'ascii' and 'unicode' use NFKD normalization from\n :func:`unicodedata.normalize`.\n\nlowercase : bool, default=True\n Convert all characters to lowercase before tokenizing.\n\npreprocessor : callable, default=None\n Override the preprocessing (string transformation) stage while\n preserving the tokenizing and n-grams generation steps.\n Only applies if ``analyzer`` is not callable.\n\ntokenizer : callable, default=None\n Override the string tokenization step while preserving the\n preprocessing and n-grams generation steps.\n Only applies if ``analyzer == 'word'``.\n\nanalyzer : {'word', 'char', 'char_wb'} or callable, default='word'\n Whether the feature should be made of word or character n-grams.\n Option 'char_wb' creates character n-grams only from text inside\n word boundaries; n-grams at the edges of words are padded with space.\n\n If a callable is passed it is used to extract the sequence of features\n out of the raw, unprocessed input.\n\n .. versionchanged:: 0.21\n Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data\n is first read from the file and then passed to the given callable\n analyzer.\n\nstop_words : {'english'}, list, default=None\n If a string, it is passed to _check_stop_list and the appropriate stop\n list is returned. 'english' is currently the only supported string\n value.\n There are several known issues with 'english' and you should\n consider an alternative (see :ref:`stop_words`).\n\n If a list, that list is assumed to contain stop words, all of which\n will be removed from the resulting tokens.\n Only applies if ``analyzer == 'word'``.\n\n If None, no stop words will be used. max_df can be set to a value\n in the range [0.7, 1.0) to automatically detect and filter stop\n words based on intra corpus document frequency of terms.\n\ntoken_pattern : str, default=r\"(?u)\\\\b\\\\w\\\\w+\\\\b\"\n Regular expression denoting what constitutes a \"token\", only used\n if ``analyzer == 'word'``. 
The default regexp selects tokens of 2\n or more alphanumeric characters (punctuation is completely ignored\n and always treated as a token separator).\n\n If there is a capturing group in token_pattern then the\n captured group content, not the entire match, becomes the token.\n At most one capturing group is permitted.\n\nngram_range : tuple (min_n, max_n), default=(1, 1)\n The lower and upper boundary of the range of n-values for different\n n-grams to be extracted. All values of n such that min_n <= n <= max_n\n will be used. For example an ``ngram_range`` of ``(1, 1)`` means only\n unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means\n only bigrams.\n Only applies if ``analyzer`` is not callable.\n\nmax_df : float or int, default=1.0\n When building the vocabulary ignore terms that have a document\n frequency strictly higher than the given threshold (corpus-specific\n stop words).\n If float in range [0.0, 1.0], the parameter represents a proportion of\n documents, integer absolute counts.\n This parameter is ignored if vocabulary is not None.\n\nmin_df : float or int, default=1\n When building the vocabulary ignore terms that have a document\n frequency strictly lower than the given threshold. This value is also\n called cut-off in the literature.\n If float in range of [0.0, 1.0], the parameter represents a proportion\n of documents, integer absolute counts.\n This parameter is ignored if vocabulary is not None.\n\nmax_features : int, default=None\n If not None, build a vocabulary that only consider the top\n max_features ordered by term frequency across the corpus.\n\n This parameter is ignored if vocabulary is not None.\n\nvocabulary : Mapping or iterable, default=None\n Either a Mapping (e.g., a dict) where keys are terms and values are\n indices in the feature matrix, or an iterable over terms. If not\n given, a vocabulary is determined from the input documents.\n\nbinary : bool, default=False\n If True, all non-zero term counts are set to 1. This does not mean\n outputs will have only 0/1 values, only that the tf term in tf-idf\n is binary. (Set idf and normalization to False to get 0/1 outputs).\n\ndtype : dtype, default=float64\n Type of the matrix returned by fit_transform() or transform().\n\nnorm : {'l1', 'l2'}, default='l2'\n Each output row will have unit norm, either:\n\n - 'l2': Sum of squares of vector elements is 1. The cosine\n similarity between two vectors is their dot product when l2 norm has\n been applied.\n - 'l1': Sum of absolute values of vector elements is 1.\n See :func:`preprocessing.normalize`.\n\nuse_idf : bool, default=True\n Enable inverse-document-frequency reweighting. If False, idf(t) = 1.\n\nsmooth_idf : bool, default=True\n Smooth idf weights by adding one to document frequencies, as if an\n extra document was seen containing every term in the collection\n exactly once. Prevents zero divisions.\n\nsublinear_tf : bool, default=False\n Apply sublinear tf scaling, i.e. 
replace tf with 1 + log(tf).\n\nAttributes\n----------\nvocabulary_ : dict\n A mapping of terms to feature indices.\n\nfixed_vocabulary_ : bool\n True if a fixed vocabulary of term to indices mapping\n is provided by the user.\n\nidf_ : array of shape (n_features,)\n The inverse document frequency (IDF) vector; only defined\n if ``use_idf`` is True.\n\nstop_words_ : set\n Terms that were ignored because they either:\n\n - occurred in too many documents (`max_df`)\n - occurred in too few documents (`min_df`)\n - were cut off by feature selection (`max_features`).\n\n This is only available if no vocabulary was given.\n\nSee Also\n--------\nCountVectorizer : Transforms text into a sparse matrix of n-gram counts.\n\nTfidfTransformer : Performs the TF-IDF transformation from a provided\n matrix of counts.\n\nNotes\n-----\nThe ``stop_words_`` attribute can get large and increase the model size\nwhen pickling. This attribute is provided only for introspection and can\nbe safely removed using delattr or set to None before pickling.\n\nExamples\n--------\n>>> from sklearn.feature_extraction.text import TfidfVectorizer\n>>> corpus = [\n... 'This is the first document.',\n... 'This document is the second document.',\n... 'And this is the third one.',\n... 'Is this the first document?',\n... ]\n>>> vectorizer = TfidfVectorizer()\n>>> X = vectorizer.fit_transform(corpus)\n>>> vectorizer.get_feature_names_out()\narray(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',\n 'this'], ...)\n>>> print(X.shape)\n(4, 9)"
}
],
"functions": [
{
"id": "sklearn/sklearn.feature_extraction.text/TfidfVectorizer/__init__",
"name": "__init__",
"qname": "sklearn.feature_extraction.text.TfidfVectorizer.__init__",
"decorators": [],
"parameters": [
{
"id": "sklearn/sklearn.feature_extraction.text/TfidfVectorizer/__init__/min_df",
"name": "min_df",
"qname": "sklearn.feature_extraction.text.TfidfVectorizer.__init__.min_df",
"default_value": "1",
"assigned_by": "NAME_ONLY",
"is_public": true,
"docstring": {
"type": "float or int",
"description": "When building the vocabulary ignore terms that have a document\nfrequency strictly lower than the given threshold. This value is also\ncalled cut-off in the literature.\nIf float in range of [0.0, 1.0], the parameter represents a proportion\nof documents, integer absolute counts.\nThis parameter is ignored if vocabulary is not None."
},
"type": {}
}
],
"results": [],
"is_public": true,
"reexported_by": [],
"description": "Convert a collection of raw documents to a matrix of TF-IDF features.\n\nEquivalent to :class:`CountVectorizer` followed by\n:class:`TfidfTransformer`.\n\nRead more in the :ref:`User Guide <text_feature_extraction>`.",
"docstring": ""
}
]
}
Minimal Usage Store (optional)
Minimal Usage Store for `sklearn/sklearn.feature_extraction.text/TfidfVectorizer/__init__/min_df`
{
"schemaVersion": 1,
"module_counts": {
"sklearn/sklearn.feature_extraction.text": 18626
},
"class_counts": {
"sklearn/sklearn.feature_extraction.text/TfidfVectorizer": 8587
},
"function_counts": {
"sklearn/sklearn.feature_extraction.text/TfidfVectorizer/__init__": 3212
},
"parameter_counts": {
"sklearn/sklearn.feature_extraction.text/TfidfVectorizer/__init__/min_df": 1028
},
"value_counts": {
"sklearn/sklearn.feature_extraction.text/TfidfVectorizer/__init__/min_df": {
"0": 41,
"1": 2242,
"2": 172,
"3": 323,
"4": 7,
"5": 214,
"6": 1,
"7": 2,
"9": 1,
"10": 77,
"15": 4,
"20": 9,
"23": 1,
"25": 1,
"30": 3,
"50": 6,
"100": 12,
"150": 9,
"200": 6,
"0.0": 13,
"0.01": 12,
"9e-05": 10,
"min_df": 10,
"0.001": 6,
"0.2": 4,
"self.min_df": 3,
"np.int(min_df * texts.shape[0])": 2,
"0.005": 2,
"0.1": 2,
"0.05": 2,
"MIN_DOCUMENT_FREQUENCY": 2,
"1e-07": 1,
"0.009": 1,
"noOfocurance": 1,
"0.15": 1,
"model_question.vocabulary.min_count": 1,
"0.0005": 1,
"0.0001": 1,
"MIN_DF_TF": 1,
"NAME_MIN_DF": 1,
"0.0025": 1,
"min_df_opt": 1,
"self.tfidf_min_df": 1,
"mindf": 1
}
}
}
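The usage data above already undercuts the inferred interval. Below is a small illustrative tally over the literal numeric values from `value_counts` (entries that are variable names or expressions, such as `min_df`, `self.min_df`, or `np.int(min_df * texts.shape[0])`, are omitted); it shows that hundreds of real calls pass integer counts greater than 1:

# Literal min_df values and their observed counts, copied from the
# usage store above (non-numeric entries left out).
value_counts = {
    "0": 41, "1": 2242, "2": 172, "3": 323, "4": 7, "5": 214, "6": 1,
    "7": 2, "9": 1, "10": 77, "15": 4, "20": 9, "23": 1, "25": 1,
    "30": 3, "50": 6, "100": 12, "150": 9, "200": 6, "0.0": 13,
    "0.01": 12, "9e-05": 10, "0.001": 6, "0.2": 4, "0.005": 2,
    "0.1": 2, "0.05": 2, "1e-07": 1, "0.009": 1, "0.15": 1,
    "0.0005": 1, "0.0001": 1, "0.0025": 1,
}

# Count usages whose value falls outside the autogenerated [0, 1] interval.
outside = sum(n for v, n in value_counts.items() if float(v) > 1)
total = sum(value_counts.values())
print(f"{outside} of {total} literal min_df values lie outside [0, 1]")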
Suggested Solution (optional)
No response
Additional Context (optional)
The description only says "If float in range of [0.0, 1.0] ...", which is an if-case, not a requirement: `min_df` may just as well be an integer absolute count greater than 1. The boundary annotation is therefore wrongly added; see the sketch below.
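To make this concrete, here is a minimal sketch against the real scikit-learn API showing both documented forms of `min_df`; the integer form routinely exceeds the inferred upper limit of 1:

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Float form: a proportion of documents, which does lie in [0.0, 1.0].
X_ratio = TfidfVectorizer(min_df=0.25).fit_transform(corpus)

# Integer form: an absolute document count. min_df=2 is outside the
# autogenerated interval [0, 1], yet it is accepted without complaint.
X_count = TfidfVectorizer(min_df=2).fit_transform(corpus)

print(X_ratio.shape, X_count.shape)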
Metadata
Assignees
Labels
@boundary (Related to the @boundary annotation) · bug 🪲 (Something isn't working) · wrong annotation (An annotation was generated automatically but is incorrect)
Type
Projects
Status
✔️ Done
