Sync with latest updates by nicolay-r · Pull Request #438 · nicolay-r/AREkit · GitHub

Sync with latest updates #438


Merged · 6 commits · Jan 18, 2023
2 changes: 2 additions & 0 deletions README.md
@@ -55,6 +55,8 @@ Please follows th
 ## Applications

+* **AREnets** [[github]](https://github.com/nicolay-r/AREnets)
+  * is an OpenNRE-like project, but with a kernel based on the TensorFlow library, with neural networks implemented on top of it, designed for Attitude
 * **ARElight** [[site]](https://nicolay-r.github.io/arelight-page/) [[github]](https://github.com/nicolay-r/ARElight)
   * **Infer attitudes** from large Mass-media documents or **sample texts** for your Machine Learning models applications
1 change: 1 addition & 0 deletions arekit/contrib/networks/input/const.py
@@ -4,5 +4,6 @@
 SynonymObject = "syn_objs"
 SynonymSubject = "syn_subjs"
 PosTags = "pos_tags"
+Text = "text"

 ArgsSep = ','
51 changes: 33 additions & 18 deletions arekit/contrib/networks/input/rows_parser.py
@@ -1,24 +1,35 @@
-import pandas as pd
-
 from arekit.common.data import const
 from arekit.common.utils import filter_whitespaces, split_by_whitespaces
-from . import const as network_input_const
+import arekit.contrib.networks.input.const as network_input_const
+
+empty_list = []
+
+
+def no_value():
+    return None


 def __process_values_list(value):
     return value.split(network_input_const.ArgsSep)


 def __process_indices_list(value):
-    return [int(v) for v in str(value).split(network_input_const.ArgsSep)]
+    return no_value() if not value else [int(v) for v in str(value).split(network_input_const.ArgsSep)]


 def __process_int_values_list(value):
     return __process_indices_list(value)


+def __handle_text(value):
+    """ The core method of the input text processing.
+    """
+    assert(isinstance(value, str) or isinstance(value, list))
+    return filter_whitespaces([term for term in split_by_whitespaces(value)]
+                              if isinstance(value, str) else value)
+
+
 parse_value = {
     const.ID: lambda value: value,
     const.DOC_ID: lambda value: int(value),
@@ -35,18 +46,19 @@ def __process_int_values_list(value):
     network_input_const.SynonymObject: lambda value: __process_indices_list(value),
     network_input_const.SynonymSubject: lambda value: __process_indices_list(value),
     network_input_const.PosTags: lambda value: __process_int_values_list(value),
-    "text_a": lambda value: filter_whitespaces([term for term in split_by_whitespaces(value)])
+    network_input_const.Text: lambda value: __handle_text(value)
 }


 class ParsedSampleRow(object):
-    """
-    Provides a parsed information for a sample row.
-    TODO. Use this class as API
+    """ Provides a parsed information for a sample row.
     """

     def __init__(self, row):
-        assert(isinstance(row, pd.Series))
+        """ row: dict
+            dict of the pairs ("field_name", value)
+        """
+        assert(isinstance(row, dict))

         self.__uint_label = None
         self.__params = {}
@@ -64,13 +76,16 @@ def __init__(self, row):

         self.__params[key] = parse_value[key](value)

+    def __value_or_none(self, key):
+        return self.__params[key] if key in self.__params else no_value()
+
     @property
     def SampleID(self):
         return self.__params[const.ID]

     @property
     def Terms(self):
-        return self.__params["text_a"]
+        return self.__params[network_input_const.Text]

     @property
     def SubjectIndex(self):
@@ -86,33 +101,33 @@ def UintLabel(self):

     @property
     def PartOfSpeechTags(self):
-        return self.__params[network_input_const.PosTags]
+        return self.__value_or_none(network_input_const.PosTags)

     @property
     def TextFrameVariantIndices(self):
-        return self.__params[network_input_const.FrameVariantIndices]
+        return self.__value_or_none(network_input_const.FrameVariantIndices)

     @property
     def TextFrameConnotations(self):
-        return self.__params[network_input_const.FrameConnotations]
+        return self.__value_or_none(network_input_const.FrameConnotations)

     @property
     def EntityInds(self):
-        return self.__params[const.ENTITIES]
+        return self.__value_or_none(const.ENTITIES)

     @property
     def SynonymObjectInds(self):
-        return self.__params[network_input_const.SynonymObject]
+        return self.__value_or_none(network_input_const.SynonymObject)

     @property
     def SynonymSubjectInds(self):
-        return self.__params[network_input_const.SynonymSubject]
+        return self.__value_or_none(network_input_const.SynonymSubject)

     def __getitem__(self, item):
         assert (isinstance(item, str) or item is None)
         if item not in self.__params:
-            return None
-        return self.__params[item] if item is not None else None
+            return no_value()
+        return self.__params[item] if item is not None else no_value()

     @classmethod
     def parse(cls, row):
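The key behavioral change above is that empty or absent optional columns now yield None instead of raising. A self-contained sketch of that pattern, mirroring the two helpers from rows_parser.py (this snippet stands alone and is not part of the diff):

```python
# Standalone sketch of the None-for-missing-value pattern introduced above.
ArgsSep = ','  # same separator as arekit.contrib.networks.input.const.ArgsSep


def no_value():
    return None


def process_indices_list(value):
    # "1,5,9" -> [1, 5, 9]; an empty or missing cell now yields None
    # instead of raising, so optional columns may be absent from a row.
    return no_value() if not value else [int(v) for v in str(value).split(ArgsSep)]


print(process_indices_list("1,5,9"))  # [1, 5, 9]
print(process_indices_list(""))       # None
```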
37 changes: 22 additions & 15 deletions arekit/contrib/source/brat/annot.py
@@ -14,33 +14,40 @@ def __non_prefixed_id(value):

     @staticmethod
     def handle_entity(args):
-        if len(args) < 4:
-            return None
-
-        if not str.isdigit(args[2]) or not str.isdigit(args[3]):
-            return None
-
-        e_id = int(BratAnnotationParser.__non_prefixed_id(args[0]))
-        e_str_type = args[1]
-        e_begin = int(args[2])
-        e_end = int(args[3])
-        e_value = " ".join([arg.strip().replace(',', '') for arg in args[4:]])
+        """ T2 Location 10 23 South America
+            T1 Location 0 5;16 23 North America
+        """
+        assert(len(args) == 3)
+
+        e_id = int(BratAnnotationParser.__non_prefixed_id(args[0]))
+        entity_params = args[1].split()
+
+        if len(entity_params) > 3:
+            # We do not support the case of non-continuous entity mentions.
+            return None
+
+        e_str_type, e_begin, e_end = entity_params

         return BratEntity(id_in_doc=e_id,
                           e_type=e_str_type,
-                          index_begin=e_begin,
-                          index_end=e_end,
-                          value=e_value)
+                          index_begin=int(e_begin),
+                          index_end=int(e_end),
+                          value=args[2].strip())

     @staticmethod
     def handle_relation(args):
+        """ Example:
+            R1 Origin Arg1:T3 Arg2:T4
+        """
+
+        # Parse identifier index.
         e_id = args[0][1:]

-        rel_type = args[1]
-        source_id = args[2].split(':')[1]
-        target_id = args[3].split(':')[1]
+        # Parse relation arguments.
+        rel_type, source, target = args[1].split()
+
+        source_id = source.split(':')[1]
+        target_id = target.split(':')[1]

         return BratRelation(id_in_doc=e_id,
                             source_id=int(BratAnnotationParser.__non_prefixed_id(source_id)),
@@ -57,7 +64,7 @@ def parse_annotations(input_file, encoding='utf-8'):
         for line in input_file.readlines():
             line = line.decode(encoding)

-            args = line.split()
+            args = line.split('\t')

             record_type = args[0][0]
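To make the tab-based scheme concrete, here is a small illustration (not part of the diff) of how one BRAT entity line decomposes after this change:

```python
# Illustrative only: decompose a tab-separated BRAT entity annotation line
# the way the updated handle_entity() does.
line = "T2\tLocation 10 23\tSouth America"

args = line.split('\t')          # ['T2', 'Location 10 23', 'South America']
entity_params = args[1].split()  # ['Location', '10', '23']
e_str_type, e_begin, e_end = entity_params

print(e_str_type, int(e_begin), int(e_end), args[2].strip())
# -> Location 10 23 South America
```

Splitting on tabs first keeps multi-word entity values intact and makes non-continuous mentions ("0 5;16 23") detectable by the token count of the middle field.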
15 changes: 15 additions & 0 deletions arekit/contrib/utils/data/readers/jsonl.py
@@ -0,0 +1,15 @@
+from arekit.contrib.utils.data.readers.base import BaseReader
+from arekit.contrib.utils.data.storages.jsonl_based import JsonlBasedRowsStorage
+
+
+class JsonlReader(BaseReader):
+
+    def read(self, target):
+        rows = []
+        with open(target, "r") as f:
+            for line in f.readlines():
+                rows.append(line)
+        return JsonlBasedRowsStorage(rows)
+
+    def target_extension(self):
+        return ".jsonl"
18 changes: 18 additions & 0 deletions arekit/contrib/utils/data/storages/jsonl_based.py
@@ -0,0 +1,18 @@
+import json
+
+from arekit.common.data.storages.base import BaseRowsStorage
+
+
+class JsonlBasedRowsStorage(BaseRowsStorage):
+
+    def __init__(self, rows):
+        assert(isinstance(rows, list))
+        self.__rows = rows
+
+    def _iter_rows(self):
+        for row_index, row in enumerate(self.__rows):
+            assert(isinstance(row, str))
+            yield row_index, json.loads(row)
+
+    def _get_rows_count(self):
+        return len(self.__rows)
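The storage keeps raw strings and defers json.loads to iteration time. A quick sketch of that behavior (calling the protected _iter_rows directly, for illustration only):

```python
from arekit.contrib.utils.data.storages.jsonl_based import JsonlBasedRowsStorage

storage = JsonlBasedRowsStorage(rows=['{"text": "hello"}', '{"text": "world"}'])

# _iter_rows is a protected hook consumed by BaseRowsStorage; we call it
# directly here only to show the lazy per-line JSON decoding.
for row_index, row in storage._iter_rows():
    print(row_index, row["text"])
```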
5 changes: 2 additions & 3 deletions arekit/contrib/utils/evaluation/analyze_errors.py
@@ -131,13 +131,12 @@ def extract_errors(eval_result, test_samples_filepath, etalon_samples_filepath,
         for sample_col in columns_to_copy:
             eval_errors_df.at[row_id, sample_col] = sample_row[sample_col]

-        text_terms =__post_text_processing(sample_row=sample_row, source_ind=source_ind, target_ind=target_ind)
+        text_terms = __post_text_processing(sample_row=sample_row, source_ind=source_ind, target_ind=target_ind)
         cropped_text = __crop_text_terms(source_ind=source_ind, target_ind=target_ind, text_terms=text_terms)

         eval_errors_df.at[row_id, BaseSingleTextProvider.TEXT_A] = cropped_text

-        # Replace with the values instead of indices.
-        entity_inds = __get_entity_inds(sample_row)
+        # Replace source and target indices with the corresponding term values.
         eval_errors_df.at[row_id, const.S_IND] = text_terms[source_ind]
         eval_errors_df.at[row_id, const.T_IND] = text_terms[target_ind]
@@ -44,7 +44,6 @@ def create_text_opinion_extraction_pipeline(text_parser,
                                                 version=version,
                                                 doc_id_func=lambda doc_id: doc_id,
                                                 keep_doc_ids_only=False,
-                                                label_scaler=label_scaler,
                                                 limit=limit)

     doc_ops = DictionaryBasedDocumentOperations(ru_attitudes)
@@ -18,7 +18,7 @@ def get_doc(self, doc_id):
         return self.__ru_attitudes[doc_id]


-def read_ruattitudes_to_brat_in_memory(version, keep_doc_ids_only, doc_id_func, label_scaler, limit=None):
+def read_ruattitudes_to_brat_in_memory(version, keep_doc_ids_only, doc_id_func, limit=None):
     """ Performs reading of RuAttitude formatted documents and
         selection according to 'doc_ids_set' parameter.
     """