Corpus

Corpus

Container for a full Corpus with train/dev/test splits. Used to apply core functions to all datasets at once.

all: List[recon.types.Example] (property, readonly)

Return concatenation of train/dev/test datasets

Returns

Type Description
List[recon.types.Example] List[Example]: All Examples in Corpus

dev: List[recon.types.Example] (property, readonly)

Return dev dataset

Returns

Type Description
List[recon.types.Example] List[Example]: Dev Examples

test: List[recon.types.Example] (property, readonly)

Return test dataset

Returns

Type Description
List[recon.types.Example] List[Example]: Test Examples

train: List[recon.types.Example] (property, readonly)

Return train dataset

Returns

Type Description
List[recon.types.Example] List[Example]: Train Examples

__init__(self, train, dev, test=None, example_store=None)

Show source code in recon/corpus.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
    def __init__(
        self, train: Dataset, dev: Dataset, test: Dataset = None, example_store: ExampleStore = None
    ):
        """Build a Corpus from its train/dev/test datasets.

        All datasets end up pointing at one shared example store.

        Args:
            train (Dataset): Dataset holding the **train** split
            dev (Dataset): Dataset holding the **dev** split
            test (Dataset, optional): Defaults to None. Dataset holding the **test** split.
                When None, a new ``Dataset("test")`` is created in its place.
            example_store (ExampleStore, optional): Defaults to None. Store to attach to
                every dataset. When None, a new ExampleStore is built from the
                train and dev data, plus the test data if a truthy test dataset was given.
        """
        store = example_store
        if store is None:
            # Concatenation yields a fresh sequence, so the in-place += below
            # does not touch train.data.
            combined = train.data + dev.data
            if test:
                combined += test.data
            store = ExampleStore(combined)
        self.example_store = store

        # Substitute a placeholder only after the store has been built, so the
        # placeholder never contributes examples to it.
        if test is None:
            test = Dataset("test")

        # Point every split at the same store before keeping references to them.
        for split in (train, dev, test):
            split.example_store = store

        self._train = train
        self._dev = dev
        self._test = test

Initialize a Corpus.

Parameters

Name Type Description Default
train Dataset List of examples for train set required
dev Dataset List of examples for dev set required
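test Dataset Defaults to None. List of examples for test set None
example_store ExampleStore Defaults to None. Store shared by all datasets; when None, one is built from the train, dev, and test examples None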

apply(self, func, *args, **kwargs)

Show source code in recon/corpus.py
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
    def apply(
        self, func: Callable[[List[Example], Any, Any], Any], *args: Any, **kwargs: Any
    ) -> CorpusApplyResult:
        """Run a function over every split of the Corpus and collect the results.

        ``func`` is called once each for ``self.train``, ``self.dev``, ``self.test``
        and ``self.all`` (in that order), with ``*args`` and ``**kwargs`` forwarded
        unchanged to every call.

        Args:
            func (Callable[[List[Example], Any, Any], Any]): 
                Function from an existing recon module that can operate on a List of examples

        Returns:
            CorpusApplyResult: CorpusApplyResult mapping dataset name to return type of func Callable
        """
        # Each property is read immediately before its func call, and dict
        # comprehensions evaluate in iteration order: train, dev, test, all.
        per_split = {
            split: func(getattr(self, split), *args, **kwargs)  # type: ignore
            for split in ("train", "dev", "test", "all")
        }
        return CorpusApplyResult(**per_split)

Apply a function to all datasets

Parameters

Name Type Description Default
func Callable[[List[recon.types.Example], Any, Any], Any] Function from an existing recon module that can operate on a List of examples required

Returns

Type Description
CorpusApplyResult CorpusApplyResult: CorpusApplyResult mapping dataset name to return type of func Callable

apply_(self, operation, *args, **kwargs)

Show source code in recon/corpus.py
101
102
103
104
105
106
107
108
109
110
111
112
    def apply_(
        self, operation: Callable[[Any], OperationResult], *args: Any, **kwargs: Any
    ) -> None:
        """Run an in-place operation on the train, dev and test datasets.

        Delegates to each dataset's own ``apply_`` in the order train, dev, test,
        forwarding ``*args`` and ``**kwargs`` unchanged.

        Args:
            operation (Callable[[Any], OperationResult]): Any operation that
                changes data in place. See recon.operations.registry.operations
        """
        # Look each dataset up right before it is processed, one at a time.
        for attr in ("_train", "_dev", "_test"):
            getattr(self, attr).apply_(operation, *args, **kwargs)

Apply a function to all data inplace.

Parameters

Name Type Description Default
operation Callable[[Any], recon.types.OperationResult] Any operation that changes data in place. See recon.operations.registry.operations required

from_disk(data_dir, train_file='train.jsonl', dev_file='dev.jsonl', test_file='test.jsonl', loader_func=read_jsonl) (classmethod)

Show source code in recon/corpus.py
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
    @classmethod
    def from_disk(
        cls,
        data_dir: Path,
        train_file: str = "train.jsonl",
        dev_file: str = "dev.jsonl",
        test_file: str = "test.jsonl",
        loader_func: Callable = read_jsonl,
    ) -> "Corpus":
        """Load a Corpus from a directory containing train, dev and (optionally) test files.

        The train and dev files are always loaded. If loading the test file raises
        a ValueError, the Corpus is built without test data instead.

        Args:
            data_dir (Path): directory to load from.
            train_file (str, optional): Filename of train data under data_dir. Defaults to train.jsonl.
            dev_file (str, optional): Filename of dev data under data_dir. Defaults to dev.jsonl.
            test_file (str, optional): Filename of test data under data_dir. Defaults to test.jsonl.
            loader_func (Callable, optional): Callable that reads a file and returns a List of examples. 
                Defaults to [read_jsonl][recon.loaders.read_jsonl]

        Returns:
            Corpus: Corpus built from the loaded datasets.
        """
        # NOTE(review): loader_func is accepted but never used in this method;
        # each Dataset is loaded with its own defaults. Confirm whether it
        # should be forwarded to Dataset.from_disk.
        data_dir = ensure_path(data_dir)

        train = Dataset("train").from_disk(data_dir / train_file)
        dev = Dataset("dev").from_disk(data_dir / dev_file)

        # Only the test load is optional, so only that call is guarded. A
        # ValueError raised while constructing the Corpus must surface rather
        # than silently produce a Corpus without test data.
        try:
            test = Dataset("test").from_disk(data_dir / test_file)
        except ValueError:
            # Presumably raised when the test file is missing -- TODO confirm.
            return cls(train, dev)
        return cls(train, dev, test=test)

Load Corpus from disk given a directory with files named explicitly train.jsonl, dev.jsonl, and test.jsonl

Parameters

Name Type Description Default
data_dir Path directory to load from. required
train_file str Filename of train data under data_dir. Defaults to train.jsonl. 'train.jsonl'
dev_file str Filename of dev data under data_dir. Defaults to dev.jsonl. 'dev.jsonl'
test_file str Filename of test data under data_dir. Defaults to test.jsonl. 'test.jsonl'
loader_func Callable Callable that reads a file and returns a List of examples. Defaults to read_jsonl read_jsonl

pipe_(self, operations)

Show source code in recon/corpus.py
114
115
116
117
118
119
120
121
122
123
    def pipe_(self, operations: List[Union[str, OperationState]]) -> None:
        """Apply a list of operations to the train, dev and test datasets.

        Delegates to ``Dataset.pipe_`` on each dataset in the order train, dev,
        test, passing the same ``operations`` list to every call.

        Args:
            operations (List[Union[str, OperationState]]): List of operations
        """
        # Look each dataset up right before it is processed, one at a time.
        for attr in ("_train", "_dev", "_test"):
            getattr(self, attr).pipe_(operations)

Run a sequence of operations on each dataset. Calls Dataset.pipe_ for each dataset

Parameters

Name Type Description Default
operations List[Union[str, recon.types.OperationState]] List of operations required

to_disk(self, data_dir, force=False)

Show source code in recon/corpus.py
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
    def to_disk(self, data_dir: Path, force: bool = False) -> None:
        """Write the Corpus to a directory.

        Train and dev are written to ``train.jsonl`` and ``dev.jsonl``; the test
        dataset is written to ``test.jsonl`` only if it is truthy. The example
        store is written once, to ``.recon/example_store.jsonl`` under data_dir.

        Args:
            data_dir (Path): Directory to save data to
            force (bool): Force save to directory. Create parent directories
                or overwrite existing data.
        """
        target_dir = ensure_path(data_dir)
        recon_dir = target_dir / ".recon"

        # Directories are only created on a forced save.
        # NOTE(review): with force=False nothing here creates the .recon
        # directory -- confirm ExampleStore.to_disk copes when it is missing.
        if force:
            target_dir.mkdir(parents=True, exist_ok=True)

            if not recon_dir.exists():
                recon_dir.mkdir(parents=True, exist_ok=True)

        # save_examples=False on each dataset: the shared store is written
        # separately at the end of this method.
        for split, filename in ((self._train, "train.jsonl"), (self._dev, "dev.jsonl")):
            split.to_disk(target_dir / filename, force=force, save_examples=False)

        test_split = self._test
        if test_split:
            test_split.to_disk(target_dir / "test.jsonl", force=force, save_examples=False)

        self.example_store.to_disk(recon_dir / "example_store.jsonl")

Save Corpus to Disk

Parameters

Name Type Description Default
data_dir Path Directory to save data to required
force bool Force save to directory. Create parent directories or overwrite existing data. False