Following the scenario in Lab 2, we are asked to “verify that the curated schema is the one used for the ExampleValidator
run we will be investigating.” However, without the previous notebooks we know nothing about that schema, so how can you actually load and inspect it to understand what changes you might want to make in response to the anomalies?
I was able to follow the exercise easily enough to find the schema through the execution id of the events related to the anomalies artifact. In the end, the artifact id of the schema is 4.
From there I try something like this:
mySchema = store.get_artifacts_by_id([4])[0]
# mySchema.uri is ./pipeline//updated_schema; that directory contains only one file.
schemaFile = !ls {mySchema.uri}
tfdv.load_anomalies_text(tfdv.load_schema_text(mySchema.uri + "/" + schemaFile[0]))
However, this raises a TypeError and dumps a non-pretty-printed version of the schema in the error message. Is there a better way?
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-34-fd7b6fc814ea> in <module>
1 mySchema = store.get_artifacts_by_id([4])[0]
2 schemaFile = get_ipython().getoutput('ls {mySchema.uri}')
----> 3 tfdv.load_anomalies_text(tfdv.load_schema_text(mySchema.uri + "/" + schemaFile[0]))
/opt/conda/lib/python3.8/site-packages/tensorflow_data_validation/utils/anomalies_util.py in load_anomalies_text(input_path)
145 """
146 anomalies = anomalies_pb2.Anomalies()
--> 147 anomalies_text = io_util.read_file_to_string(input_path)
148 text_format.Parse(anomalies_text, anomalies)
149 return anomalies
/opt/conda/lib/python3.8/site-packages/tensorflow_data_validation/utils/io_util.py in read_file_to_string(filename, binary_mode)
51 else:
52 f = tf.io.gfile.GFile(filename, mode="r")
---> 53 return f.read()
/opt/conda/lib/python3.8/site-packages/tensorflow/python/lib/io/file_io.py in read(self, n)
114 string if in string (regular) mode.
115 """
--> 116 self._preread_check()
117 if n == -1:
118 length = self.size() - self.tell()
/opt/conda/lib/python3.8/site-packages/tensorflow/python/lib/io/file_io.py in _preread_check(self)
76 raise errors.PermissionDeniedError(None, None,
77 "File isn't open for reading")
---> 78 self._read_buf = _pywrap_file_io.BufferedInputStream(
79 self.__name, 1024 * 512)
80
TypeError: __init__(): incompatible constructor arguments. The following argument types are supported:
1. tensorflow.python._pywrap_file_io.BufferedInputStream(arg0: str, arg1: int)
Invoked with: feature {
name: "age"
value_count {
min: 1
max: 1
}
type: INT
int_domain {
name: "age"
min: 17
max: 90
}
presence {
min_fraction: 1.0
min_count: 1
}
}
feature {
name: "capital-gain"
value_count {
min: 1
max: 1
}
type: INT
presence {
min_fraction: 1.0
min_count: 1
}
}
feature {
name: "capital-loss"
value_count {
min: 1
max: 1
}
type: INT
presence {
min_fraction: 1.0
min_count: 1
}
}
feature {
name: "education"
value_count {
min: 1
max: 1
}
type: BYTES
domain: "education"
presence {
min_fraction: 1.0
min_count: 1
}
}
feature {
name: "education-num"
value_count {
min: 1
max: 1
}
type: INT
presence {
min_fraction: 1.0
min_count: 1
}
}
feature {
name: "fnlwgt"
value_count {
min: 1
max: 1
}
type: INT
presence {
min_fraction: 1.0
min_count: 1
}
}
feature {
name: "hours-per-week"
value_count {
min: 1
max: 1
}
type: INT
presence {
min_fraction: 1.0
min_count: 1
}
}
feature {
name: "label"
value_count {
min: 1
max: 1
}
type: BYTES
domain: "label"
presence {
min_fraction: 1.0
min_count: 1
}
not_in_environment: "SERVING"
}
feature {
name: "marital-status"
value_count {
min: 1
max: 1
}
type: BYTES
domain: "marital-status"
presence {
min_fraction: 1.0
min_count: 1
}
}
feature {
name: "native-country"
value_count {
min: 1
max: 1
}
type: BYTES
domain: "native-country"
presence {
min_fraction: 1.0
min_count: 1
}
}
feature {
name: "occupation"
value_count {
min: 1
max: 1
}
type: BYTES
domain: "occupation"
presence {
min_fraction: 1.0
min_count: 1
}
}
feature {
name: "race"
value_count {
min: 1
max: 1
}
type: BYTES
domain: "race"
presence {
min_fraction: 1.0
min_count: 1
}
}
feature {
name: "relationship"
value_count {
min: 1
max: 1
}
type: BYTES
domain: "relationship"
presence {
min_fraction: 1.0
min_count: 1
}
}
feature {
name: "sex"
value_count {
min: 1
max: 1
}
type: BYTES
domain: "sex"
presence {
min_fraction: 1.0
min_count: 1
}
}
feature {
name: "workclass"
value_count {
min: 1
max: 1
}
type: BYTES
domain: "workclass"
presence {
min_fraction: 1.0
min_count: 1
}
}
string_domain {
name: "education"
value: " 10th"
value: " 11th"
value: " 12th"
value: " 1st-4th"
value: " 5th-6th"
value: " 7th-8th"
value: " 9th"
value: " Assoc-acdm"
value: " Assoc-voc"
value: " Bachelors"
value: " Doctorate"
value: " HS-grad"
value: " Masters"
value: " Preschool"
value: " Prof-school"
value: " Some-college"
}
string_domain {
name: "label"
value: " <=50K"
value: " >50K"
}
string_domain {
name: "marital-status"
value: " Divorced"
value: " Married-AF-spouse"
value: " Married-civ-spouse"
value: " Married-spouse-absent"
value: " Never-married"
value: " Separated"
value: " Widowed"
}
string_domain {
name: "native-country"
value: " ?"
value: " Cambodia"
value: " Canada"
value: " China"
value: " Columbia"
value: " Cuba"
value: " Dominican-Republic"
value: " Ecuador"
value: " El-Salvador"
value: " England"
value: " France"
value: " Germany"
value: " Greece"
value: " Guatemala"
value: " Haiti"
value: " Honduras"
value: " Hong"
value: " Hungary"
value: " India"
value: " Iran"
value: " Ireland"
value: " Italy"
value: " Jamaica"
value: " Japan"
value: " Laos"
value: " Mexico"
value: " Nicaragua"
value: " Outlying-US(Guam-USVI-etc)"
value: " Peru"
value: " Philippines"
value: " Poland"
value: " Portugal"
value: " Puerto-Rico"
value: " Scotland"
value: " South"
value: " Taiwan"
value: " Thailand"
value: " Trinadad&Tobago"
value: " United-States"
value: " Vietnam"
value: " Yugoslavia"
value: " Holand-Netherlands"
}
string_domain {
name: "occupation"
value: " ?"
value: " Adm-clerical"
value: " Armed-Forces"
value: " Craft-repair"
value: " Exec-managerial"
value: " Farming-fishing"
value: " Handlers-cleaners"
value: " Machine-op-inspct"
value: " Other-service"
value: " Priv-house-serv"
value: " Prof-specialty"
value: " Protective-serv"
value: " Sales"
value: " Tech-support"
value: " Transport-moving"
}
string_domain {
name: "race"
value: " Amer-Indian-Eskimo"
value: " Asian-Pac-Islander"
value: " Black"
value: " Other"
value: " White"
}
string_domain {
name: "relationship"
value: " Husband"
value: " Not-in-family"
value: " Other-relative"
value: " Own-child"
value: " Unmarried"
value: " Wife"
}
string_domain {
name: "sex"
value: " Female"
value: " Male"
}
string_domain {
name: "workclass"
value: " ?"
value: " Federal-gov"
value: " Local-gov"
value: " Never-worked"
value: " Private"
value: " Self-emp-inc"
value: " Self-emp-not-inc"
value: " State-gov"
value: " Without-pay"
}
default_environment: "TRAINING"
default_environment: "SERVING"
, 524288