Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[text analytics] add normalized_text #17074

Merged
merged 2 commits into from
Mar 3, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions sdk/textanalytics/azure-ai-textanalytics/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@
- Renamed properties `aspect` and `opinions` to `target` and `assessments` respectively in class `MinedOpinion`.
- Renamed classes `AspectSentiment` and `OpinionSentiment` to `TargetSentiment` and `AssessmentSentiment` respectively.

**New Features**

- Add property `normalized_text` to `HealthcareEntity`. This property is a normalized version of the `text` property that already
  exists on `HealthcareEntity`.

## 5.1.0b5 (2021-02-10)

**Breaking Changes**
Expand Down
1 change: 1 addition & 0 deletions sdk/textanalytics/azure-ai-textanalytics/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,7 @@ print("Results of Healthcare Entities Analysis:")
for idx, doc in enumerate(docs):
for entity in doc.entities:
print("Entity: {}".format(entity.text))
print("...Normalized Text: {}".format(entity.normalized_text))
print("...Category: {}".format(entity.category))
print("...Subcategory: {}".format(entity.subcategory))
print("...Offset: {}".format(entity.offset))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -492,7 +492,9 @@ def __repr__(self):
class HealthcareEntity(DictMixin):
"""HealthcareEntity contains information about a Healthcare entity found in text.

:ivar str text: Entity text as appears in the request.
:ivar str text: Entity text as appears in the document.
:ivar str normalized_text: Optional. Normalized version of the raw `text` we extract
from the document. Not all `text`s have a normalized version.
:ivar str category: Entity category, see the following link for health's named
entity types: https://aka.ms/text-analytics-health-entities
:ivar str subcategory: Entity subcategory.
Expand All @@ -510,6 +512,7 @@ class HealthcareEntity(DictMixin):

def __init__(self, **kwargs):
self.text = kwargs.get("text", None)
self.normalized_text = kwargs.get("normalized_text", None)
self.category = kwargs.get("category", None)
self.subcategory = kwargs.get("subcategory", None)
self.length = kwargs.get("length", None)
Expand All @@ -521,6 +524,7 @@ def __init__(self, **kwargs):
def _from_generated(cls, healthcare_entity):
return cls(
text=healthcare_entity.text,
normalized_text=healthcare_entity.name,
category=healthcare_entity.category,
subcategory=healthcare_entity.subcategory,
length=healthcare_entity.length,
Expand All @@ -535,9 +539,10 @@ def __hash__(self):
return hash(repr(self))

def __repr__(self):
return "HealthcareEntity(text={}, category={}, subcategory={}, length={}, offset={}, confidence_score={}, "\
"data_sources={})".format(
return "HealthcareEntity(text={}, normalized_text={}, category={}, subcategory={}, length={}, offset={}, "\
"confidence_score={}, data_sources={})".format(
self.text,
self.normalized_text,
self.category,
self.subcategory,
self.length,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ async def analyze_healthcare_entities_async(self):
for idx, doc in enumerate(docs):
for entity in doc.entities:
print("Entity: {}".format(entity.text))
print("...Normalized Text: {}".format(entity.normalized_text))
print("...Category: {}".format(entity.category))
print("...Subcategory: {}".format(entity.subcategory))
print("...Offset: {}".format(entity.offset))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def analyze_healthcare_entities(self):
for idx, doc in enumerate(docs):
for entity in doc.entities:
print("Entity: {}".format(entity.text))
print("...Normalized Text: {}".format(entity.normalized_text))
print("...Category: {}".format(entity.category))
print("...Subcategory: {}".format(entity.subcategory))
print("...Offset: {}".format(entity.offset))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
interactions:
- request:
body: '{"documents": [{"id": "0", "text": "patients must have histologically confirmed
NHL", "language": "en"}]}'
headers:
Accept:
- application/json, text/json
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '105'
Content-Type:
- application/json
User-Agent:
- azsdk-python-ai-textanalytics/5.1.0b6 Python/3.9.1 (macOS-10.13.6-x86_64-i386-64bit)
method: POST
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs?stringIndexType=UnicodeCodePoint
response:
body:
string: ''
headers:
apim-request-id:
- 1c4b7bf9-4eaf-41c1-8c28-585fd380d751
date:
- Wed, 03 Mar 2021 21:46:23 GMT
operation-location:
- https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs/7249ddad-d020-43dd-9a69-a2b0216cd9bd
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
transfer-encoding:
- chunked
x-content-type-options:
- nosniff
x-envoy-upstream-service-time:
- '338'
status:
code: 202
message: Accepted
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
User-Agent:
- azsdk-python-ai-textanalytics/5.1.0b6 Python/3.9.1 (macOS-10.13.6-x86_64-i386-64bit)
method: GET
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs/7249ddad-d020-43dd-9a69-a2b0216cd9bd
response:
body:
string: '{"jobId":"7249ddad-d020-43dd-9a69-a2b0216cd9bd","lastUpdateDateTime":"2021-03-03T21:46:24Z","createdDateTime":"2021-03-03T21:46:23Z","expirationDateTime":"2021-03-04T21:46:23Z","status":"notStarted","errors":[]}'
headers:
apim-request-id:
- 57339114-5845-4f08-ab4d-0aa36c843d25
content-type:
- application/json; charset=utf-8
date:
- Wed, 03 Mar 2021 21:46:28 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
transfer-encoding:
- chunked
x-content-type-options:
- nosniff
x-envoy-upstream-service-time:
- '146'
status:
code: 200
message: OK
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
User-Agent:
- azsdk-python-ai-textanalytics/5.1.0b6 Python/3.9.1 (macOS-10.13.6-x86_64-i386-64bit)
method: GET
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs/7249ddad-d020-43dd-9a69-a2b0216cd9bd
response:
body:
string: '{"jobId":"7249ddad-d020-43dd-9a69-a2b0216cd9bd","lastUpdateDateTime":"2021-03-03T21:46:32Z","createdDateTime":"2021-03-03T21:46:23Z","expirationDateTime":"2021-03-04T21:46:23Z","status":"running","errors":[]}'
headers:
apim-request-id:
- 417f0558-5abd-49fd-8cd7-32f2d03549bd
content-type:
- application/json; charset=utf-8
date:
- Wed, 03 Mar 2021 21:46:33 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
transfer-encoding:
- chunked
x-content-type-options:
- nosniff
x-envoy-upstream-service-time:
- '122'
status:
code: 200
message: OK
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
User-Agent:
- azsdk-python-ai-textanalytics/5.1.0b6 Python/3.9.1 (macOS-10.13.6-x86_64-i386-64bit)
method: GET
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs/7249ddad-d020-43dd-9a69-a2b0216cd9bd
response:
body:
string: '{"jobId":"7249ddad-d020-43dd-9a69-a2b0216cd9bd","lastUpdateDateTime":"2021-03-03T21:46:32Z","createdDateTime":"2021-03-03T21:46:23Z","expirationDateTime":"2021-03-04T21:46:23Z","status":"running","errors":[]}'
headers:
apim-request-id:
- 54ddb168-5bcc-4610-86b4-1b02d2241bd5
content-type:
- application/json; charset=utf-8
date:
- Wed, 03 Mar 2021 21:46:39 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
transfer-encoding:
- chunked
x-content-type-options:
- nosniff
x-envoy-upstream-service-time:
- '87'
status:
code: 200
message: OK
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
User-Agent:
- azsdk-python-ai-textanalytics/5.1.0b6 Python/3.9.1 (macOS-10.13.6-x86_64-i386-64bit)
method: GET
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs/7249ddad-d020-43dd-9a69-a2b0216cd9bd
response:
body:
string: '{"jobId":"7249ddad-d020-43dd-9a69-a2b0216cd9bd","lastUpdateDateTime":"2021-03-03T21:46:43Z","createdDateTime":"2021-03-03T21:46:23Z","expirationDateTime":"2021-03-04T21:46:23Z","status":"succeeded","errors":[],"results":{"documents":[{"id":"0","entities":[{"offset":19,"length":14,"text":"histologically","category":"ExaminationName","confidenceScore":1.0,"links":[{"dataSource":"UMLS","id":"C0344441"},{"dataSource":"CHV","id":"0000030964"},{"dataSource":"LNC","id":"MTHU010496"},{"dataSource":"MDR","id":"10062005"},{"dataSource":"MTH","id":"U002823"},{"dataSource":"MTHMST","id":"MT140012"},{"dataSource":"NCI","id":"C49131"},{"dataSource":"SNOMEDCT_US","id":"714797009"}]},{"offset":44,"length":3,"text":"NHL","category":"Diagnosis","confidenceScore":1.0,"links":[{"dataSource":"UMLS","id":"C0024305"},{"dataSource":"BI","id":"BI00323"},{"dataSource":"CCPSS","id":"0001640"},{"dataSource":"CCS","id":"2.10.2"},{"dataSource":"CCSR_10","id":"NEO058"},{"dataSource":"CHV","id":"0000007621"},{"dataSource":"COSTAR","id":"U000045"},{"dataSource":"CSP","id":"4001-0094"},{"dataSource":"DXP","id":"U002830"},{"dataSource":"HPO","id":"HP:0012539"},{"dataSource":"ICD10","id":"C85.9"},{"dataSource":"ICD10AM","id":"M9672/3"},{"dataSource":"ICD10CM","id":"C85.9"},{"dataSource":"ICPC2ICD10ENG","id":"MTHU053464"},{"dataSource":"ICPC2P","id":"B74002"},{"dataSource":"MDR","id":"10029547"},{"dataSource":"MEDCIN","id":"35839"},{"dataSource":"MEDLINEPLUS","id":"117"},{"dataSource":"MSH","id":"D008228"},{"dataSource":"NCI","id":"C3211"},{"dataSource":"NCI_CELLOSAURUS","id":"C3211"},{"dataSource":"NCI_CPTAC","id":"C3211"},{"dataSource":"NCI_CTEP-SDC","id":"10029593"},{"dataSource":"NCI_CTRP","id":"C3211"},{"dataSource":"NCI_GDC","id":"C3211"},{"dataSource":"NCI_NCI-GLOSS","id":"CDR0000045148"},{"dataSource":"NCI_NICHD","id":"C3211"},{"dataSource":"OMIM","id":"MTHU014311"},{"dataSource":"PDQ","id":"CDR0000038957"},{"dataSource":"QMR","id":"R0121804"},{"dataSource":"RCD","id":"B627."},{"dataSo
urce":"SNM","id":"M-YYX54"},{"dataSource":"SNMI","id":"M-96723"},{"dataSource":"SNOMEDCT_US","id":"1929004"},{"dataSource":"WHO","id":"1544"}]}],"relations":[],"warnings":[]}],"errors":[],"modelVersion":"2021-01-11"}}'
headers:
apim-request-id:
- 356495ad-d24a-4870-ae9a-3bc03cdc951b
content-type:
- application/json; charset=utf-8
date:
- Wed, 03 Mar 2021 21:46:45 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
transfer-encoding:
- chunked
x-content-type-options:
- nosniff
x-envoy-upstream-service-time:
- '302'
status:
code: 200
message: OK
version: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
interactions:
- request:
body: '{"documents": [{"id": "0", "text": "patients must have histologically confirmed
NHL", "language": "en"}]}'
headers:
Accept:
- application/json, text/json
Content-Length:
- '105'
Content-Type:
- application/json
User-Agent:
- azsdk-python-ai-textanalytics/5.1.0b6 Python/3.9.1 (macOS-10.13.6-x86_64-i386-64bit)
method: POST
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs?stringIndexType=UnicodeCodePoint
response:
body:
string: ''
headers:
apim-request-id: 5f62849b-975a-4da4-8d9f-359e2b7af6d4
date: Wed, 03 Mar 2021 21:46:45 GMT
operation-location: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs/4be5a0c6-2663-46d8-ba56-ffeefe175b9b
strict-transport-security: max-age=31536000; includeSubDomains; preload
transfer-encoding: chunked
x-content-type-options: nosniff
x-envoy-upstream-service-time: '75'
status:
code: 202
message: Accepted
url: https://westus2.api.cognitive.microsoft.com//text/analytics/v3.1-preview.4/entities/health/jobs?stringIndexType=UnicodeCodePoint
- request:
body: null
headers:
User-Agent:
- azsdk-python-ai-textanalytics/5.1.0b6 Python/3.9.1 (macOS-10.13.6-x86_64-i386-64bit)
method: GET
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs/4be5a0c6-2663-46d8-ba56-ffeefe175b9b
response:
body:
string: '{"jobId":"4be5a0c6-2663-46d8-ba56-ffeefe175b9b","lastUpdateDateTime":"2021-03-03T21:46:48Z","createdDateTime":"2021-03-03T21:46:45Z","expirationDateTime":"2021-03-04T21:46:45Z","status":"succeeded","errors":[],"results":{"documents":[{"id":"0","entities":[{"offset":19,"length":14,"text":"histologically","category":"ExaminationName","confidenceScore":1.0,"links":[{"dataSource":"UMLS","id":"C0344441"},{"dataSource":"CHV","id":"0000030964"},{"dataSource":"LNC","id":"MTHU010496"},{"dataSource":"MDR","id":"10062005"},{"dataSource":"MTH","id":"U002823"},{"dataSource":"MTHMST","id":"MT140012"},{"dataSource":"NCI","id":"C49131"},{"dataSource":"SNOMEDCT_US","id":"714797009"}]},{"offset":44,"length":3,"text":"NHL","category":"Diagnosis","confidenceScore":1.0,"links":[{"dataSource":"UMLS","id":"C0024305"},{"dataSource":"BI","id":"BI00323"},{"dataSource":"CCPSS","id":"0001640"},{"dataSource":"CCS","id":"2.10.2"},{"dataSource":"CCSR_10","id":"NEO058"},{"dataSource":"CHV","id":"0000007621"},{"dataSource":"COSTAR","id":"U000045"},{"dataSource":"CSP","id":"4001-0094"},{"dataSource":"DXP","id":"U002830"},{"dataSource":"HPO","id":"HP:0012539"},{"dataSource":"ICD10","id":"C85.9"},{"dataSource":"ICD10AM","id":"M9672/3"},{"dataSource":"ICD10CM","id":"C85.9"},{"dataSource":"ICPC2ICD10ENG","id":"MTHU053464"},{"dataSource":"ICPC2P","id":"B74002"},{"dataSource":"MDR","id":"10029547"},{"dataSource":"MEDCIN","id":"35839"},{"dataSource":"MEDLINEPLUS","id":"117"},{"dataSource":"MSH","id":"D008228"},{"dataSource":"NCI","id":"C3211"},{"dataSource":"NCI_CELLOSAURUS","id":"C3211"},{"dataSource":"NCI_CPTAC","id":"C3211"},{"dataSource":"NCI_CTEP-SDC","id":"10029593"},{"dataSource":"NCI_CTRP","id":"C3211"},{"dataSource":"NCI_GDC","id":"C3211"},{"dataSource":"NCI_NCI-GLOSS","id":"CDR0000045148"},{"dataSource":"NCI_NICHD","id":"C3211"},{"dataSource":"OMIM","id":"MTHU014311"},{"dataSource":"PDQ","id":"CDR0000038957"},{"dataSource":"QMR","id":"R0121804"},{"dataSource":"RCD","id":"B627."},{"dataSo
urce":"SNM","id":"M-YYX54"},{"dataSource":"SNMI","id":"M-96723"},{"dataSource":"SNOMEDCT_US","id":"1929004"},{"dataSource":"WHO","id":"1544"}]}],"relations":[],"warnings":[]}],"errors":[],"modelVersion":"2021-01-11"}}'
headers:
apim-request-id: f9b79e8f-3fa1-4623-99b1-bf925c6b3b60
content-type: application/json; charset=utf-8
date: Wed, 03 Mar 2021 21:46:50 GMT
strict-transport-security: max-age=31536000; includeSubDomains; preload
transfer-encoding: chunked
x-content-type-options: nosniff
x-envoy-upstream-service-time: '30'
status:
code: 200
message: OK
url: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs/4be5a0c6-2663-46d8-ba56-ffeefe175b9b
version: 1
Original file line number Diff line number Diff line change
Expand Up @@ -373,3 +373,17 @@ def test_relations(self, client):
else:
assert role.name == HealthcareEntityRelationRoleType.ABBREVIATED_TERM
self.assert_healthcare_entities_equal(role.entity, parkinsons_abbreviation_entity)

@GlobalTextAnalyticsAccountPreparer()
@TextAnalyticsClientPreparer()
def test_normalized_text(self, client):
    """Check that every returned healthcare entity exposes ``normalized_text``.

    :param client: Text Analytics client supplied by the preparer decorators.
    """
    result = list(client.begin_analyze_healthcare_entities(
        documents=["patients must have histologically confirmed NHL"]
    ).result())

    # currently just testing it has that attribute.
    # have an issue to update https://github.com/Azure/azure-sdk-for-python/issues/17072

    entities = result[0].entities
    # Guard against a vacuous pass: all() over an empty sequence is True.
    assert entities
    # Test the attribute on each entity; filtering on hasattr first would
    # silently drop any entity that lacks the attribute instead of failing.
    assert all(hasattr(e, "normalized_text") for e in entities)
Original file line number Diff line number Diff line change
Expand Up @@ -420,4 +420,22 @@ async def test_relations(self, client):
assert role.name == "AbbreviatedTerm"
self.assert_healthcare_entities_equal(role.entity, parkinsons_abbreviation_entity)

@GlobalTextAnalyticsAccountPreparer()
@TextAnalyticsClientPreparer()
async def test_normalized_text(self, client):
    """Check that every returned healthcare entity exposes ``normalized_text``.

    :param client: Async Text Analytics client supplied by the preparer decorators.
    """
    response = await (await client.begin_analyze_healthcare_entities(
        documents=["patients must have histologically confirmed NHL"]
    )).result()

    # Drain the async iterable of per-document results into a list.
    result = []
    async for r in response:
        result.append(r)

    # currently just testing it has that attribute.
    # have an issue to update https://github.com/Azure/azure-sdk-for-python/issues/17072

    entities = result[0].entities
    # Guard against a vacuous pass: all() over an empty sequence is True.
    assert entities
    # Test the attribute on each entity; filtering on hasattr first would
    # silently drop any entity that lacks the attribute instead of failing.
    assert all(hasattr(e, "normalized_text") for e in entities)


3 changes: 2 additions & 1 deletion sdk/textanalytics/azure-ai-textanalytics/tests/test_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@ def data_source():
def healthcare_entity(data_source):
model = _models.HealthcareEntity(
text="Bonjour",
normalized_text="Bonjour",
category="MyCategory",
subcategory="MySubcategory",
length=7,
Expand All @@ -286,7 +287,7 @@ def healthcare_entity(data_source):
data_sources=[data_source[0]],
)
model_repr = (
"HealthcareEntity(text=Bonjour, category=MyCategory, subcategory=MySubcategory, length=7, offset=12, " +
"HealthcareEntity(text=Bonjour, normalized_text=Bonjour, category=MyCategory, subcategory=MySubcategory, length=7, offset=12, " +
"confidence_score=0.95, data_sources=[{}])".format(data_source[1])
)

Expand Down