#!/usr/bin/env python
"""
Parse all Python files under `data/source` and write them to a single file.

Every source file is tokenized, cleaned up (comments and empty lines removed,
indentation tokens rewritten) and encoded. The output file `data/all.py`
holds two lines per source file: the file path, followed by the encoded
tokens separated by single spaces.
"""

import os
from pathlib import Path
from typing import List, NamedTuple

from labml import logger, monit
from parser import tokenizer
from parser.tokenizer import encode, parse_string

# NOTE: these two constants are not referenced anywhere in this file; they are
# kept because other modules may import them.
COMMENT = '#'
MULTI_COMMENT = '"""'


class _PythonFile(NamedTuple):
    # Path of the file relative to its project folder
    relative_path: str
    # Top-level folder under the source folder ('.' for files directly in it)
    project: str
    # Path of the file as discovered while walking the source folder
    path: Path


class _GetPythonFiles:
    """
    Get list of python files and their paths inside `data/source` folder
    """

    def __init__(self):
        # The source folder is resolved against the current working directory
        self.source_path = Path(os.getcwd()) / 'data' / 'source'
        self.files: List[_PythonFile] = []
        self.get_python_files(self.source_path)

        logger.inspect([f.path for f in self.files])

    def add_file(self, path: Path):
        """
        Add a file to the list of files

        The first path component below the source folder is the project and
        the remainder is the path relative to that project.
        """
        relative = path.relative_to(self.source_path)
        parts = relative.parts

        if len(parts) > 1:
            project = parts[0]
            relative_path = Path(*parts[1:])
        else:
            # A file directly inside the source folder has no project folder.
            # Indexing `relative.parents` for this case needs a negative
            # index, which fails before Python 3.10, so handle it explicitly.
            project = '.'
            relative_path = relative

        self.files.append(_PythonFile(relative_path=str(relative_path),
                                      project=str(project),
                                      path=path))

    def get_python_files(self, path: Path):
        """
        Recursively collect files

        Only files with the `.py` suffix are added.
        """
        # `iterdir` yields entries in arbitrary order; sort them so that the
        # generated file is the same on every run and platform.
        for p in sorted(path.iterdir()):
            if p.is_dir():
                self.get_python_files(p)
            elif p.suffix == '.py':
                self.add_file(p)


def _fix_indentation(parsed: List[tokenizer.ParsedToken]) -> List[tokenizer.ParsedToken]:
    """
    Change indentation tokens. Remove `DEDENT` tokens and
    add `INDENT` tokens to each line.
    This is easier for prediction.

    The original `INDENT`/`DEDENT` tokens only adjust the current depth and
    are dropped. The first other token on every line is preceded by one
    `INDENT` token per level of depth. Lines that consist of a new-line token
    only get no `INDENT` tokens.
    """
    res: List[tokenizer.ParsedToken] = []
    indentation = 0
    # Whether the indentation of the current line has been emitted already
    indented = False

    for t in parsed:
        if t.type == tokenizer.TokenType.indent:
            indentation += 1
        elif t.type == tokenizer.TokenType.dedent:
            indentation -= 1
        elif t.type in (tokenizer.TokenType.new_line, tokenizer.TokenType.eof):
            # End of line; the next line has to be indented again
            indented = False
            res.append(t)
        else:
            if not indented:
                # The second argument is 0 for these synthetic tokens; its
                # meaning is defined by `tokenizer.ParsedToken`.
                res.extend(tokenizer.ParsedToken(tokenizer.TokenType.indent, 0)
                           for _ in range(indentation))
                indented = True
            res.append(t)

    return res


def _remove_comments(parsed: List[tokenizer.ParsedToken]) -> List[tokenizer.ParsedToken]:
    """
    Remove comment tokens
    """
    return [p for p in parsed if p.type != tokenizer.TokenType.comment]


def _remove_empty_lines(parsed: List[tokenizer.ParsedToken]) -> List[tokenizer.ParsedToken]:
    """
    Remove empty lines

    A new-line token is dropped when the token before it is also a new-line
    token. New-line tokens at the very beginning are dropped as well.
    """
    new_line = tokenizer.TokenType.new_line

    res = []
    # Starting with `new_line` makes leading new-line tokens count as empty
    previous = new_line
    for p in parsed:
        is_empty_line = previous == new_line and p.type == new_line
        previous = p.type
        if not is_empty_line:
            res.append(p)

    return res


def _read_file(path: Path) -> List[int]:
    """
    Read and encode a file

    The file is parsed into tokens, comments and empty lines are removed,
    indentation tokens are rewritten and the result is passed to `encode`.
    """
    # Python source files are UTF-8 by default; do not depend on the locale
    with open(str(path), encoding='utf-8') as f:
        content = f.read()

    parsed = parse_string(content)
    parsed = _remove_comments(parsed)
    parsed = _remove_empty_lines(parsed)
    parsed = _fix_indentation(parsed)

    return encode(parsed)


def main():
    """
    Encode all source files and write them to `data/all.py`

    Two lines are written per source file: the path of the file and the
    encoded tokens joined by single spaces.
    """
    source_files = _GetPythonFiles().files

    logger.inspect(source_files)

    # The paths written below may contain non-ASCII characters, so the
    # encoding of the output file is fixed instead of taken from the locale.
    with open(str(Path(os.getcwd()) / 'data' / 'all.py'), 'w', encoding='utf-8') as f:
        for _, source in monit.enum("Parse", source_files):
            serialized = _read_file(source.path)
            f.write(f"{str(source.path)}\n")
            f.write(" ".join(str(t) for t in serialized) + "\n")


if __name__ == '__main__':
    main()