C2_W3_Assignment Exercise 2: ExampleGen Error

Hello!

4 - Data Pipeline

PIPELINE_DIR = './pipeline'

WARNING:absl:InteractiveContext metadata_connection_config not provided: using SQLite ML Metadata database at ./pipeline/metadata.sqlite.

Exercise 2: ExampleGen

example_gen = CsvExampleGen(input_base=PIPELINE_DIR)
context.run(example_gen)


UnicodeDecodeError Traceback (most recent call last)
in
13
14 # Run the component using the InteractiveContext instance
---> 15 context.run(example_gen)
16
17 #context.show(example_gen.outputs['statistics'])

/opt/conda/lib/python3.8/site-packages/tfx/orchestration/experimental/interactive/interactive_context.py in run_if_ipython(*args, **kwargs)
65 # __IPYTHON__ variable is set by IPython, see
66 # https://ipython.org/ipython-doc/rel-0.10.2/html/interactive/reference.html#embedding-ipython.
---> 67 return fn(*args, **kwargs)
68 else:
69 absl.logging.warning(

/opt/conda/lib/python3.8/site-packages/tensorflow/python/util/compat.py in as_text(bytes_or_text, encoding)
107 return bytes_or_text
108 elif isinstance(bytes_or_text, bytes):
--> 109 return bytes_or_text.decode(encoding)
110 else:
111 raise TypeError('Expected binary or unicode string, got %r' % bytes_or_text)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf1 in position 98: invalid continuation byte

I'm also having trouble with this step.
Since the expected output cell displayed ['input_base'] ./data, I replaced None with DATA_DIR.

my code

example_gen = CsvExampleGen(input_base=DATA_DIR)

error code

---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
<ipython-input-16-55b5981e3d13> in <module>
      5 
      6 # Run the component using the InteractiveContext instance
----> 7 context.run(example_gen)
      8 
      9 ### END CODE HERE

/opt/conda/lib/python3.8/site-packages/tfx/orchestration/experimental/interactive/interactive_context.py in run_if_ipython(*args, **kwargs)
     65       # __IPYTHON__ variable is set by IPython, see
     66       # https://ipython.org/ipython-doc/rel-0.10.2/html/interactive/reference.html#embedding-ipython.
---> 67       return fn(*args, **kwargs)
     68     else:
     69       absl.logging.warning(

/opt/conda/lib/python3.8/site-packages/tfx/orchestration/experimental/interactive/interactive_context.py in run(self, component, enable_cache, beam_pipeline_args)
    180         telemetry_utils.LABEL_TFX_RUNNER: runner_label,
    181     }):
--> 182       execution_id = launcher.launch().execution_id
    183 
    184     return execution_result.ExecutionResult(

/opt/conda/lib/python3.8/site-packages/tfx/orchestration/launcher/base_component_launcher.py in launch(self)
    200       absl.logging.info('Running executor for %s',
    201                         self._component_info.component_id)
--> 202       self._run_executor(execution_decision.execution_id,
    203                          execution_decision.input_dict,
    204                          execution_decision.output_dict,

/opt/conda/lib/python3.8/site-packages/tfx/orchestration/launcher/in_process_component_launcher.py in _run_executor(self, execution_id, input_dict, output_dict, exec_properties)
     65         executor_context)  # type: ignore
     66 
---> 67     executor.Do(input_dict, output_dict, exec_properties)

/opt/conda/lib/python3.8/site-packages/tfx/components/example_gen/base_example_gen_executor.py in Do(self, input_dict, output_dict, exec_properties)
    294     logging.info('Generating examples.')
    295     with self._make_beam_pipeline() as pipeline:
--> 296       example_splits = self.GenerateExamplesByBeam(pipeline, exec_properties)
    297 
    298       # pylint: disable=expression-not-assigned, no-value-for-parameter

/opt/conda/lib/python3.8/site-packages/tfx/components/example_gen/base_example_gen_executor.py in GenerateExamplesByBeam(self, pipeline, exec_properties)
    226         buckets.append(total_buckets)
    227       example_splits = (
--> 228           pipeline
    229           | 'InputToRecord' >>
    230           # pylint: disable=no-value-for-parameter

/opt/conda/lib/python3.8/site-packages/apache_beam/transforms/ptransform.py in __ror__(self, pvalueish, _unused)
   1056 
   1057   def __ror__(self, pvalueish, _unused=None):
-> 1058     return self.transform.__ror__(pvalueish, self.label)
   1059 
   1060   def expand(self, pvalue):

/opt/conda/lib/python3.8/site-packages/apache_beam/transforms/ptransform.py in __ror__(self, left, label)
    571     pvalueish = _SetInputPValues().visit(pvalueish, replacements)
    572     self.pipeline = p
--> 573     result = p.apply(self, pvalueish, label)
    574     if deferred:
    575       return result

/opt/conda/lib/python3.8/site-packages/apache_beam/pipeline.py in apply(self, transform, pvalueish, label)
    644       try:
    645         old_label, transform.label = transform.label, label
--> 646         return self.apply(transform, pvalueish)
    647       finally:
    648         transform.label = old_label

/opt/conda/lib/python3.8/site-packages/apache_beam/pipeline.py in apply(self, transform, pvalueish, label)
    687         transform.type_check_inputs(pvalueish)
    688 
--> 689       pvalueish_result = self.runner.apply(transform, pvalueish, self._options)
    690 
    691       if type_options is not None and type_options.pipeline_type_check:

/opt/conda/lib/python3.8/site-packages/apache_beam/runners/runner.py in apply(self, transform, input, options)
    186       m = getattr(self, 'apply_%s' % cls.__name__, None)
    187       if m:
--> 188         return m(transform, input, options)
    189     raise NotImplementedError(
    190         'Execution of [%s] not implemented in runner %s.' % (transform, self))

/opt/conda/lib/python3.8/site-packages/apache_beam/runners/runner.py in apply_PTransform(self, transform, input, options)
    216   def apply_PTransform(self, transform, input, options):
    217     # The base case of apply is to call the transform's expand.
--> 218     return transform.expand(input)
    219 
    220   def run_transform(self,

/opt/conda/lib/python3.8/site-packages/apache_beam/transforms/ptransform.py in expand(self, pcoll)
    962       # Might not be a function.
    963       pass
--> 964     return self._fn(pcoll, *args, **kwargs)
    965 
    966   def default_label(self):

/opt/conda/lib/python3.8/site-packages/tfx/components/example_gen/csv_example_gen/executor.py in _CsvToExample(pipeline, exec_properties, split_pattern)
    139   parsed_csv_lines = (
    140       pipeline
--> 141       | 'ReadFromText' >> beam.io.ReadFromText(
    142           file_pattern=csv_pattern, skip_header_lines=1)
    143       | 'ParseCSVLine' >> beam.ParDo(csv_decoder.ParseCSVLine(delimiter=','))

/opt/conda/lib/python3.8/site-packages/apache_beam/io/textio.py in __init__(self, file_pattern, min_bundle_size, compression_type, strip_trailing_newlines, coder, validate, skip_header_lines, **kwargs)
    563 
    564     super(ReadFromText, self).__init__(**kwargs)
--> 565     self._source = self._source_class(
    566         file_pattern,
    567         min_bundle_size,

/opt/conda/lib/python3.8/site-packages/apache_beam/io/textio.py in __init__(self, file_pattern, min_bundle_size, compression_type, strip_trailing_newlines, coder, buffer_size, validate, skip_header_lines, header_processor_fns)
    125     of the arguments.
    126     """
--> 127     super(_TextSource, self).__init__(
    128         file_pattern,
    129         min_bundle_size,

/opt/conda/lib/python3.8/site-packages/apache_beam/io/filebasedsource.py in __init__(self, file_pattern, min_bundle_size, compression_type, splittable, validate)
    126     self._splittable = splittable
    127     if validate and file_pattern.is_accessible():
--> 128       self._validate()
    129 
    130   def display_data(self):

/opt/conda/lib/python3.8/site-packages/apache_beam/options/value_provider.py in _f(self, *args, **kwargs)
    198         if not obj.is_accessible():
    199           raise error.RuntimeValueProviderError('%s not accessible' % obj)
--> 200       return fnc(self, *args, **kwargs)
    201 
    202     return _f

/opt/conda/lib/python3.8/site-packages/apache_beam/io/filebasedsource.py in _validate(self)
    189     match_result = FileSystems.match([pattern], limits=[1])[0]
    190     if len(match_result.metadata_list) <= 0:
--> 191       raise IOError('No files found based on the file pattern %s' % pattern)
    192 
    193   def split(

OSError: No files found based on the file pattern ./data/*

Earlier, in Exercise 1, the feature selection step created a subset of the original dataframe and saved it to a new directory. In Exercise 2, CsvExampleGen needs to target that new directory you just created.

1 Like

I faced the same problem. Could you help me figure out how to target that new directory?
Thank you